From 829b558b0c3131646ee7f05d9e088758aa137616 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Wed, 29 May 2024 10:19:35 +0200 Subject: [PATCH 01/61] ignores types on sqlalchemy 1.4 (#1419) --- .../website/docs/dlt-ecosystem/verified-sources/sql_database.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md index 1891157b4a..4a80de1bdf 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md @@ -179,7 +179,7 @@ pipeline = dlt.pipeline( def _double_as_decimal_adapter(table: sa.Table) -> None: """Return double as double, not decimals, this is mysql thing""" for column in table.columns.values(): - if isinstance(column.type, sa.Double): + if isinstance(column.type, sa.Double): # type: ignore column.type.asdecimal = False sql_alchemy_source = sql_database( From 9f17a21ecfe646930844051669aaaaba79597bef Mon Sep 17 00:00:00 2001 From: Marcel Coetzee <34739235+Pipboyguy@users.noreply.github.com> Date: Mon, 3 Jun 2024 12:16:01 +0200 Subject: [PATCH 02/61] 1272 Support ClickHouse GCS S3 compatibility mode in filesystem destination (#1423) * Add HMAC credentials and update Clickhouse configuration Signed-off-by: Marcel Coetzee * Revert "Add HMAC credentials and update Clickhouse configuration" This reverts commit cb80c6be5e4fc651bce0fb0aef430d998c281f4e. * Refactor error handling for storage authentication in Clickhouse Signed-off-by: Marcel Coetzee * Revert "Refactor error handling for storage authentication in Clickhouse" This reverts commit f24eb1da5fe89f774fb2717e3679d817ec2d778e. * Remove GCS ClickHouse buckets in CI until named destinations are supported Signed-off-by: Marcel Coetzee * Add GCS S3 compatibility test, remove GCP credentials from Clickhouse Signed-off-by: Marcel Coetzee * Refactor ClickHouse test code for better readability Signed-off-by: Marcel Coetzee * Refactor endpoint handling and update GCS bucket configuration Signed-off-by: Marcel Coetzee * Refactor test for clickhouse gcs_s3 compatibility Signed-off-by: Marcel Coetzee * Update ClickHouse docs and tests for S3-compatible staging Signed-off-by: Marcel Coetzee * Update ClickHouse documentation on staging areas Signed-off-by: Marcel Coetzee --------- Signed-off-by: Marcel Coetzee --- .../impl/clickhouse/clickhouse.py | 28 +++++---- dlt/destinations/impl/clickhouse/utils.py | 3 +- .../dlt-ecosystem/destinations/clickhouse.md | 59 ++++++++----------- .../test_clickhouse_gcs_s3_compatibility.py | 28 +++++++++ tests/load/utils.py | 14 ----- 5 files changed, 70 insertions(+), 62 deletions(-) create mode 100644 tests/load/clickhouse/test_clickhouse_gcs_s3_compatibility.py diff --git a/dlt/destinations/impl/clickhouse/clickhouse.py b/dlt/destinations/impl/clickhouse/clickhouse.py index e2c1f827bc..cf1f1bc857 100644 --- a/dlt/destinations/impl/clickhouse/clickhouse.py +++ b/dlt/destinations/impl/clickhouse/clickhouse.py @@ -1,6 +1,7 @@ import os import re from copy import deepcopy +from textwrap import dedent from typing import ClassVar, Optional, Dict, List, Sequence, cast, Tuple from urllib.parse import urlparse @@ -201,22 +202,23 @@ def __init__( compression = "none" if config.get("data_writer.disable_compression") else "gz" if bucket_scheme in ("s3", "gs", "gcs"): - # get auth and bucket url - bucket_http_url = convert_storage_to_http_scheme(bucket_url) - access_key_id: str = None - 
secret_access_key: str = None if isinstance(staging_credentials, AwsCredentialsWithoutDefaults): + bucket_http_url = convert_storage_to_http_scheme( + bucket_url, endpoint=staging_credentials.endpoint_url + ) access_key_id = staging_credentials.aws_access_key_id secret_access_key = staging_credentials.aws_secret_access_key - elif isinstance(staging_credentials, GcpCredentials): - access_key_id = client.credentials.gcp_access_key_id - secret_access_key = client.credentials.gcp_secret_access_key - if not access_key_id or not secret_access_key: - raise DestinationTransientException( - "You have tried loading from gcs with clickhouse. Please provide valid" - " 'gcp_access_key_id' and 'gcp_secret_access_key' to connect to gcs as" - " outlined in the dlthub docs." - ) + else: + raise LoadJobTerminalException( + file_path, + dedent( + """ + Google Cloud Storage buckets must be configured using the S3 compatible access pattern. + Please provide the necessary S3 credentials (access key ID and secret access key), to access the GCS bucket through the S3 API. + Refer to https://dlthub.com/docs/dlt-ecosystem/destinations/filesystem#using-s3-compatible-storage. + """, + ).strip(), + ) auth = "NOSIGN" if access_key_id and secret_access_key: diff --git a/dlt/destinations/impl/clickhouse/utils.py b/dlt/destinations/impl/clickhouse/utils.py index b0b06909f9..0e2fa3db00 100644 --- a/dlt/destinations/impl/clickhouse/utils.py +++ b/dlt/destinations/impl/clickhouse/utils.py @@ -25,11 +25,10 @@ def convert_storage_to_http_scheme( protocol = "https" if use_https else "http" if endpoint: - domain = endpoint + domain = endpoint.replace("https://", "").replace("http://", "") elif region and parsed_url.scheme == "s3": domain = f"s3-{region}.amazonaws.com" else: - # TODO: Incorporate dlt.config endpoint. storage_domains = { "s3": "s3.amazonaws.com", "gs": "storage.googleapis.com", diff --git a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md index 58551751c5..b1dde5a328 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md @@ -115,12 +115,14 @@ destination. The `clickhouse` destination has a few specific deviations from the default sql destinations: -1. `Clickhouse` has an experimental `object` datatype, but we have found it to be a bit unpredictable, so the dlt clickhouse destination will load the complex datatype to a `text` column. If you need this feature, get in touch with our Slack community, and we will consider adding it. +1. `Clickhouse` has an experimental `object` datatype, but we have found it to be a bit unpredictable, so the dlt clickhouse destination will load the complex datatype to a `text` column. If you need + this feature, get in touch with our Slack community, and we will consider adding it. 2. `Clickhouse` does not support the `time` datatype. Time will be loaded to a `text` column. 3. `Clickhouse` does not support the `binary` datatype. Binary will be loaded to a `text` column. When loading from `jsonl`, this will be a base64 string, when loading from parquet this will be the `binary` object converted to `text`. 4. `Clickhouse` accepts adding columns to a populated table that are not null. -5. `Clickhouse` can produce rounding errors under certain conditions when using the float / double datatype. Make sure to use decimal if you cannot afford to have rounding errors. 
Loading the value 12.7001 to a double column with the loader file format jsonl set will predictbly produce a rounding error for example. +5. `Clickhouse` can produce rounding errors under certain conditions when using the float / double datatype. Make sure to use decimal if you cannot afford to have rounding errors. Loading the value + 12.7001 to a double column with the loader file format jsonl set will predictbly produce a rounding error for example. ## Supported column hints @@ -173,51 +175,42 @@ pipeline = dlt.pipeline( ) ``` -### Using Google Cloud Storage as a Staging Area +### Using Google Cloud or S3-Compatible Storage as a Staging Area -dlt supports using Google Cloud Storage (GCS) as a staging area when loading data into ClickHouse. This is handled automatically by -ClickHouse's [GCS table function](https://clickhouse.com/docs/en/sql-reference/table-functions/gcs) which dlt uses under the hood. +dlt supports using S3-compatible storage services, including Google Cloud Storage (GCS), as a staging area when loading data into ClickHouse. +This is handled automatically by +ClickHouse's [GCS table function](https://clickhouse.com/docs/en/sql-reference/table-functions/gcs), which dlt uses under the hood. -The clickhouse GCS table function only supports authentication using Hash-based Message Authentication Code (HMAC) keys. To enable this, GCS provides an S3 compatibility mode that emulates -the Amazon S3 -API. ClickHouse takes advantage of this to allow accessing GCS buckets via its S3 integration. +The ClickHouse GCS table function only supports authentication using Hash-based Message Authentication Code (HMAC) keys, which is compatible with the Amazon S3 API. +To enable this, GCS provides an S3 +compatibility mode that emulates the S3 API, allowing ClickHouse to access GCS buckets via its S3 integration. + +For detailed instructions on setting up S3-compatible storage with dlt, including AWS S3, MinIO, and Cloudflare R2, refer to +the [dlt documentation on filesystem destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/filesystem#using-s3-compatible-storage). To set up GCS staging with HMAC authentication in dlt: 1. Create HMAC keys for your GCS service account by following the [Google Cloud guide](https://cloud.google.com/storage/docs/authentication/managing-hmackeys#create). -2. Configure the HMAC keys as well as the `client_email`, `project_id` and `private_key` for your service account in your dlt project's ClickHouse destination settings in `config.toml`: +2. 
Configure the HMAC keys (`aws_access_key_id` and `aws_secret_access_key`) in your dlt project's ClickHouse destination settings in `config.toml`, similar to how you would configure AWS S3 + credentials: ```toml [destination.filesystem] -bucket_url = "gs://dlt-ci" +bucket_url = "s3://my_awesome_bucket" [destination.filesystem.credentials] -project_id = "a-cool-project" -client_email = "my-service-account@a-cool-project.iam.gserviceaccount.com" -private_key = "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkaslkdjflasjnkdcopauihj...wEiEx7y+mx\nNffxQBqVVej2n/D93xY99pM=\n-----END PRIVATE KEY-----\n" - -[destination.clickhouse.credentials] -database = "dlt" -username = "dlt" -password = "Dlt*12345789234567" -host = "localhost" -port = 9440 -secure = 1 -gcp_access_key_id = "JFJ$$*f2058024835jFffsadf" -gcp_secret_access_key = "DFJdwslf2hf57)%$02jaflsedjfasoi" +aws_access_key_id = "JFJ$$*f2058024835jFffsadf" +aws_secret_access_key = "DFJdwslf2hf57)%$02jaflsedjfasoi" +project_id = "my-awesome-project" +endpoint_url = "https://storage.googleapis.com" ``` -Note: In addition to the HMAC keys (`gcp_access_key_id` and `gcp_secret_access_key`), you now need to provide the `client_email`, `project_id` and `private_key` for your service account -under `[destination.filesystem.credentials]`. -This is because the GCS staging support is now implemented as a temporary workaround and is still unoptimized. - -dlt will pass these credentials to ClickHouse which will handle the authentication and GCS access. - -There is active work in progress to simplify and improve the GCS staging setup for the ClickHouse dlt destination in the future. Proper GCS staging support is being tracked in these GitHub issues: - -- [Make filesystem destination work with gcs in s3 compatibility mode](https://github.com/dlt-hub/dlt/issues/1272) -- [GCS staging area support](https://github.com/dlt-hub/dlt/issues/1181) +:::caution +When configuring the `bucket_url` for S3-compatible storage services like Google Cloud Storage (GCS) with ClickHouse in dlt, ensure that the URL is prepended with `s3://` instead of `gs://`. This is +because the ClickHouse GCS table function requires the use of HMAC credentials, which are compatible with the S3 API. Prepending with `s3://` allows the HMAC credentials to integrate properly with +dlt's staging mechanisms for ClickHouse. 
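For example, a staging-enabled pipeline using such a bucket could be declared as follows (a minimal sketch mirroring the GCS S3-compatibility test; the bucket, pipeline, and table names are illustrative):

```py
import dlt
from dlt.destinations import filesystem

# GCS bucket addressed through the S3 API: note the s3:// scheme instead of gs://
gcs_via_s3 = filesystem("s3://my_awesome_bucket", destination_name="gcs_s3_compat")

pipeline = dlt.pipeline(
    pipeline_name="clickhouse_gcs_staging",
    destination="clickhouse",
    staging=gcs_via_s3,
)
# load a trivial payload through the S3-compatible staging bucket into ClickHouse
pipeline.run([{"field1": 1, "field2": 2}], table_name="dummy_data")
```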
+::: ### dbt support diff --git a/tests/load/clickhouse/test_clickhouse_gcs_s3_compatibility.py b/tests/load/clickhouse/test_clickhouse_gcs_s3_compatibility.py new file mode 100644 index 0000000000..481cd420c6 --- /dev/null +++ b/tests/load/clickhouse/test_clickhouse_gcs_s3_compatibility.py @@ -0,0 +1,28 @@ +from typing import Generator, Dict + +import pytest + +import dlt +from dlt.destinations import filesystem +from tests.load.utils import GCS_BUCKET +from tests.pipeline.utils import assert_load_info + + +@pytest.mark.essential +def test_clickhouse_gcs_s3_compatibility() -> None: + @dlt.resource + def dummy_data() -> Generator[Dict[str, int], None, None]: + yield {"field1": 1, "field2": 2} + + gcp_bucket = filesystem( + GCS_BUCKET.replace("gs://", "s3://"), destination_name="filesystem_s3_gcs_comp" + ) + + pipe = dlt.pipeline( + pipeline_name="gcs_s3_compatibility", + destination="clickhouse", + staging=gcp_bucket, + full_refresh=True, + ) + pack = pipe.run([dummy_data]) + assert_load_info(pack) diff --git a/tests/load/utils.py b/tests/load/utils.py index e6b860c723..445f8d815b 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -345,13 +345,6 @@ def destinations_configs( extra_info="az-authorization", disable_compression=True, ), - DestinationTestConfiguration( - destination="clickhouse", - staging="filesystem", - file_format="parquet", - bucket_url=GCS_BUCKET, - extra_info="gcs-authorization", - ), DestinationTestConfiguration( destination="clickhouse", staging="filesystem", @@ -373,13 +366,6 @@ def destinations_configs( bucket_url=AZ_BUCKET, extra_info="az-authorization", ), - DestinationTestConfiguration( - destination="clickhouse", - staging="filesystem", - file_format="jsonl", - bucket_url=GCS_BUCKET, - extra_info="gcs-authorization", - ), DestinationTestConfiguration( destination="clickhouse", staging="filesystem", From 6db85a6ff3be1c30321a78507e93ba2525091230 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Mon, 3 Jun 2024 16:16:47 +0200 Subject: [PATCH 03/61] disablel qdrant tests (#1438) --- .github/workflows/test_destination_qdrant.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_destination_qdrant.yml b/.github/workflows/test_destination_qdrant.yml index 938778fe9f..168fe315ce 100644 --- a/.github/workflows/test_destination_qdrant.yml +++ b/.github/workflows/test_destination_qdrant.yml @@ -32,7 +32,8 @@ jobs: run_loader: name: dest | qdrant tests needs: get_docs_changes - if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' + # if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' + if: false # TODO re-enable with above line defaults: run: shell: bash From cbed22540e17158a003e54623ede913e6a9efa59 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Mon, 3 Jun 2024 10:18:14 -0400 Subject: [PATCH 04/61] New "refresh" mode and "dev_mode" (#1063) * Rename full_refresh -> dev_mode, add deprecation warning * Replace some full_refresh usage in code and docs * Replace full_refresh usage in tests * Init experimental refresh = full with drop command * Refresh modes with dropped_tables file * Init separate local/write drop command * Use load_package_state instead of drop tables file * Use drop schema in init_client (TODO: error) * Separate cli drop command instructions/execute * drop tables in init_client * dropped_tables field on load package state * Fix import * test/fix truncate mode * Save truncated tables in load package state * Remove load package state copying * cleanup * Drop cmd use package 
state, refactoring * Don't drop tables without data * Validate literals in configspec * Match stored schema by version+version_hash solves detecting when dropped tables need to be recreated * Cleanup * Fix dlt version test * Cleanup * Remove dropped_tables_filename * Fix snippet * Pipeline refresh docs * refresh argument docstring * Restore and update commented drop cmd tests * Cleanup refresh tests and test whether table dropped vs truncated * Test existing schema hash * Revert "Match stored schema by version+version_hash" This reverts commit 689b3ca2239f1ed62b21021071686b02cf2072e2. * Use replace_schema flag * Change drop_tables replace_schema to delete_schema * Refresh drop only selected sources * Rename refresh modes, update docs * pipeline.run/extract refresh argument * Don't modify schema when refresh='drop_data' * Move refresh tests to load, add filesystem truncate test * Fix duck import * Remove generated doc sections * Default full_refresh=None * Cleanup unused imports * Close caution blocks * Update config field docstring * Add filesystem drop_tables method * Run all refresh tests on local filesystem destination * Fix test drop * Fix iter filesystem schemas * Fix drop_resources * Default config.full_refresh also None * Fix filesystem + test --- dlt/cli/deploy_command_helpers.py | 13 +- dlt/common/configuration/resolve.py | 2 +- .../configuration/specs/base_configuration.py | 12 +- dlt/common/configuration/utils.py | 31 +- dlt/common/pipeline.py | 13 + dlt/common/schema/schema.py | 11 + dlt/common/storages/load_package.py | 7 +- dlt/common/typing.py | 12 +- .../impl/filesystem/filesystem.py | 34 +- dlt/destinations/job_client_impl.py | 18 +- dlt/extract/extract.py | 15 +- dlt/helpers/airflow_helper.py | 12 +- dlt/load/load.py | 25 +- dlt/load/utils.py | 44 +- dlt/pipeline/__init__.py | 32 +- dlt/pipeline/configuration.py | 7 +- dlt/pipeline/drop.py | 171 +++++++ dlt/pipeline/helpers.py | 236 ++++------ dlt/pipeline/pipeline.py | 100 +++- dlt/pipeline/warnings.py | 13 + docs/examples/archive/google_sheets.py | 2 +- docs/examples/archive/quickstart.py | 2 +- docs/examples/archive/rasa_example.py | 2 +- .../archive/singer_tap_jsonl_example.py | 2 +- docs/examples/chess/chess.py | 4 +- .../website/docs/build-a-pipeline-tutorial.md | 2 +- .../docs/dlt-ecosystem/destinations/duckdb.md | 7 +- .../dlt-ecosystem/transformations/pandas.md | 2 +- .../verified-sources/salesforce.md | 2 +- .../dlt-ecosystem/verified-sources/zendesk.md | 8 +- .../visualizations/exploring-the-data.md | 2 +- .../docs/general-usage/destination-tables.md | 6 +- .../docs/general-usage/incremental-loading.md | 2 +- docs/website/docs/general-usage/pipeline.md | 40 +- docs/website/docs/general-usage/state.md | 2 +- .../performance-snippets.py | 6 +- .../docs/walkthroughs/adjust-a-schema.md | 6 +- .../deploy-with-airflow-composer.md | 2 +- .../cases/deploy_pipeline/debug_pipeline.py | 2 +- .../configuration/test_configuration.py | 23 + tests/extract/test_sources.py | 8 +- .../local/test_runner_destinations.py | 2 +- .../athena_iceberg/test_athena_iceberg.py | 2 +- .../bigquery/test_bigquery_table_builder.py | 28 +- tests/load/pipeline/test_athena.py | 6 +- tests/load/pipeline/test_dbt_helper.py | 6 +- tests/load/pipeline/test_drop.py | 45 +- .../load/pipeline/test_filesystem_pipeline.py | 5 +- tests/load/pipeline/test_merge_disposition.py | 28 +- tests/load/pipeline/test_pipelines.py | 22 +- tests/load/pipeline/test_redshift.py | 2 +- tests/load/pipeline/test_refresh_modes.py | 439 ++++++++++++++++++ 
.../load/pipeline/test_replace_disposition.py | 2 +- tests/load/pipeline/test_restore_state.py | 2 +- .../test_write_disposition_changes.py | 4 +- tests/load/qdrant/test_pipeline.py | 4 +- .../synapse/test_synapse_table_indexing.py | 4 +- tests/load/test_job_client.py | 6 +- tests/load/utils.py | 4 +- tests/load/weaviate/test_pipeline.py | 4 +- .../cases/github_pipeline/github_extract.py | 4 +- .../cases/github_pipeline/github_pipeline.py | 4 +- tests/pipeline/test_pipeline.py | 22 +- tests/pipeline/test_resources_evaluation.py | 12 +- tests/pipeline/test_schema_contracts.py | 2 +- tests/pipeline/test_schema_updates.py | 2 +- tests/pipeline/utils.py | 42 +- 67 files changed, 1263 insertions(+), 380 deletions(-) create mode 100644 dlt/pipeline/drop.py create mode 100644 tests/load/pipeline/test_refresh_modes.py diff --git a/dlt/cli/deploy_command_helpers.py b/dlt/cli/deploy_command_helpers.py index 5065ba1cfc..5fe46415dd 100644 --- a/dlt/cli/deploy_command_helpers.py +++ b/dlt/cli/deploy_command_helpers.py @@ -263,22 +263,25 @@ def parse_pipeline_info(visitor: PipelineScriptVisitor) -> List[Tuple[str, Optio if n.PIPELINE in visitor.known_calls: for call_args in visitor.known_calls[n.PIPELINE]: pipeline_name, pipelines_dir = None, None - f_r_node = call_args.arguments.get("full_refresh") + # Check both full_refresh/dev_mode until full_refresh option is removed from dlt + f_r_node = call_args.arguments.get("full_refresh") or call_args.arguments.get( + "dev_mode" + ) if f_r_node: f_r_value = evaluate_node_literal(f_r_node) if f_r_value is None: fmt.warning( - "The value of `full_refresh` in call to `dlt.pipeline` cannot be" + "The value of `dev_mode` in call to `dlt.pipeline` cannot be" f" determined from {unparse(f_r_node).strip()}. We assume that you know" " what you are doing :)" ) if f_r_value is True: if fmt.confirm( - "The value of 'full_refresh' is set to True. Do you want to abort to set it" - " to False?", + "The value of 'dev_mode' or 'full_refresh' is set to True. 
Do you want to" + " abort to set it to False?", default=True, ): - raise CliCommandException("deploy", "Please set the full_refresh to False") + raise CliCommandException("deploy", "Please set the dev_mode to False") p_d_node = call_args.arguments.get("pipelines_dir") if p_d_node: diff --git a/dlt/common/configuration/resolve.py b/dlt/common/configuration/resolve.py index 9101cfdd9c..c9644713b5 100644 --- a/dlt/common/configuration/resolve.py +++ b/dlt/common/configuration/resolve.py @@ -286,7 +286,7 @@ def _resolve_config_field( embedded_sections: Tuple[str, ...], accept_partial: bool, ) -> Tuple[Any, List[LookupTrace]]: - inner_hint = extract_inner_hint(hint) + inner_hint = extract_inner_hint(hint, preserve_literal=True) if explicit_value is not None: value = explicit_value diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 006cde8dce..0456a5374a 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -19,6 +19,7 @@ overload, ClassVar, TypeVar, + Literal, ) from typing_extensions import get_args, get_origin, dataclass_transform, Annotated, TypeAlias from functools import wraps @@ -120,13 +121,18 @@ def is_valid_hint(hint: Type[Any]) -> bool: return False -def extract_inner_hint(hint: Type[Any], preserve_new_types: bool = False) -> Type[Any]: +def extract_inner_hint( + hint: Type[Any], preserve_new_types: bool = False, preserve_literal: bool = False +) -> Type[Any]: # extract hint from Optional / Literal / NewType hints - inner_hint = extract_inner_type(hint, preserve_new_types) + inner_hint = extract_inner_type(hint, preserve_new_types, preserve_literal) # get base configuration from union type inner_hint = get_config_if_union_hint(inner_hint) or inner_hint # extract origin from generic types (ie List[str] -> List) - return get_origin(inner_hint) or inner_hint + origin = get_origin(inner_hint) or inner_hint + if preserve_literal and origin is Literal: + return inner_hint + return origin or inner_hint def is_secret_hint(hint: Type[Any]) -> bool: diff --git a/dlt/common/configuration/utils.py b/dlt/common/configuration/utils.py index 51e6b5615a..8f3c1789ce 100644 --- a/dlt/common/configuration/utils.py +++ b/dlt/common/configuration/utils.py @@ -2,7 +2,20 @@ import ast import contextlib import tomlkit -from typing import Any, Dict, Mapping, NamedTuple, Optional, Tuple, Type, Sequence +from typing import ( + Any, + Dict, + Mapping, + NamedTuple, + Optional, + Tuple, + Type, + Sequence, + get_args, + Literal, + get_origin, + List, +) from collections.abc import Mapping as C_Mapping from dlt.common.json import json @@ -51,25 +64,35 @@ def deserialize_value(key: str, value: Any, hint: Type[TAny]) -> TAny: raise return c # type: ignore + literal_values: Tuple[Any, ...] 
= () + if get_origin(hint) is Literal: + # Literal fields are validated against the literal values + literal_values = get_args(hint) + hint_origin = type(literal_values[0]) + else: + hint_origin = hint + # coerce value - hint_dt = py_type_to_sc_type(hint) + hint_dt = py_type_to_sc_type(hint_origin) value_dt = py_type_to_sc_type(type(value)) # eval only if value is string and hint is "complex" if value_dt == "text" and hint_dt == "complex": - if hint is tuple: + if hint_origin is tuple: # use literal eval for tuples value = ast.literal_eval(value) else: # use json for sequences and mappings value = json.loads(value) # exact types must match - if not isinstance(value, hint): + if not isinstance(value, hint_origin): raise ValueError(value) else: # for types that are not complex, reuse schema coercion rules if value_dt != hint_dt: value = coerce_value(hint_dt, value_dt, value) + if literal_values and value not in literal_values: + raise ConfigValueCannotBeCoercedException(key, value, hint) return value # type: ignore except ConfigValueCannotBeCoercedException: raise diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index 8baf872752..6cefdd9e6c 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -21,6 +21,7 @@ TypeVar, TypedDict, Mapping, + Literal, ) from typing_extensions import NotRequired @@ -52,6 +53,10 @@ from dlt.common.versioned_state import TVersionedState +# TRefreshMode = Literal["full", "replace"] +TRefreshMode = Literal["drop_sources", "drop_resources", "drop_data"] + + class _StepInfo(NamedTuple): pipeline: "SupportsPipeline" loads_ids: List[str] @@ -762,6 +767,14 @@ def reset_resource_state(resource_name: str, source_state_: Optional[DictStrAny] state_["resources"].pop(resource_name) +def _get_matching_sources( + pattern: REPattern, pipeline_state: Optional[TPipelineState] = None, / +) -> List[str]: + """Get all source names in state matching the regex pattern""" + state_ = _sources_state(pipeline_state) + return [key for key in state_ if pattern.match(key)] + + def _get_matching_resources( pattern: REPattern, source_state_: Optional[DictStrAny] = None, / ) -> List[str]: diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index fb7ad226f1..6d5dc48907 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -438,6 +438,17 @@ def update_schema(self, schema: "Schema") -> None: self._settings = deepcopy(schema.settings) self._compile_settings() + def drop_tables( + self, table_names: Sequence[str], seen_data_only: bool = False + ) -> List[TTableSchema]: + """Drops tables from the schema and returns the dropped tables""" + result = [] + for table_name in table_names: + table = self.tables.get(table_name) + if table and (not seen_data_only or utils.has_table_seen_data(table)): + result.append(self._schema_tables.pop(table_name)) + return result + def filter_row_with_hint(self, table_name: str, hint_type: TColumnHint, row: StrAny) -> StrAny: rv_row: DictStrAny = {} column_prop: TColumnProp = utils.hint_to_column_prop(hint_type) diff --git a/dlt/common/storages/load_package.py b/dlt/common/storages/load_package.py index 1752039775..e7c7f7a164 100644 --- a/dlt/common/storages/load_package.py +++ b/dlt/common/storages/load_package.py @@ -35,7 +35,7 @@ from dlt.common.destination import TLoaderFileFormat from dlt.common.exceptions import TerminalValueError from dlt.common.schema import Schema, TSchemaTables -from dlt.common.schema.typing import TStoredSchema, TTableSchemaColumns +from dlt.common.schema.typing import 
TStoredSchema, TTableSchemaColumns, TTableSchema from dlt.common.storages import FileStorage from dlt.common.storages.exceptions import LoadPackageNotFound, CurrentLoadPackageStateNotAvailable from dlt.common.typing import DictStrAny, SupportsHumanize @@ -76,6 +76,11 @@ class TLoadPackageState(TVersionedState, total=False): destination_state: NotRequired[Dict[str, Any]] """private space for destinations to store state relevant only to the load package""" + dropped_tables: NotRequired[List[TTableSchema]] + """List of tables that are to be dropped from the schema and destination (i.e. when `refresh` mode is used)""" + truncated_tables: NotRequired[List[TTableSchema]] + """List of tables that are to be truncated in the destination (i.e. when `refresh='drop_data'` mode is used)""" + class TLoadPackage(TypedDict, total=False): load_id: str diff --git a/dlt/common/typing.py b/dlt/common/typing.py index 7490dc6e53..2d46f367d8 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -245,7 +245,9 @@ def is_dict_generic_type(t: Type[Any]) -> bool: return False -def extract_inner_type(hint: Type[Any], preserve_new_types: bool = False) -> Type[Any]: +def extract_inner_type( + hint: Type[Any], preserve_new_types: bool = False, preserve_literal: bool = False +) -> Type[Any]: """Gets the inner type from Literal, Optional, Final and NewType Args: @@ -256,15 +258,15 @@ def extract_inner_type(hint: Type[Any], preserve_new_types: bool = False) -> Typ Type[Any]: Inner type if hint was Literal, Optional or NewType, otherwise hint """ if maybe_modified := extract_type_if_modifier(hint): - return extract_inner_type(maybe_modified, preserve_new_types) + return extract_inner_type(maybe_modified, preserve_new_types, preserve_literal) if is_optional_type(hint): - return extract_inner_type(get_args(hint)[0], preserve_new_types) - if is_literal_type(hint): + return extract_inner_type(get_args(hint)[0], preserve_new_types, preserve_literal) + if is_literal_type(hint) and not preserve_literal: # assume that all literals are of the same type return type(get_args(hint)[0]) if is_newtype_type(hint) and not preserve_new_types: # descend into supertypes of NewType - return extract_inner_type(hint.__supertype__, preserve_new_types) + return extract_inner_type(hint.__supertype__, preserve_new_types, preserve_literal) return hint diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 5070ff061c..d75226be13 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -171,6 +171,15 @@ def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: self.fs_client.makedirs(self.dataset_path, exist_ok=True) self.fs_client.touch(self.pathlib.join(self.dataset_path, INIT_FILE_NAME)) + def drop_tables(self, *tables: str, delete_schema: bool = True) -> None: + self.truncate_tables(list(tables)) + if not delete_schema: + return + # Delete all stored schemas + for filename, fileparts in self._iter_stored_schema_files(): + if fileparts[0] == self.schema.name: + self._delete_file(filename) + def truncate_tables(self, table_names: List[str]) -> None: """Truncate table with given name""" table_dirs = set(self.get_table_dirs(table_names)) @@ -180,19 +189,23 @@ def truncate_tables(self, table_names: List[str]) -> None: # NOTE: deleting in chunks on s3 does not raise on access denied, file non existing and probably other errors # print(f"DEL {table_file}") try: - # NOTE: must use rm_file to get errors on delete 
- self.fs_client.rm_file(table_file) - except NotImplementedError: - # not all filesystem implement the above - self.fs_client.rm(table_file) - if self.fs_client.exists(table_file): - raise FileExistsError(table_file) + self._delete_file(table_file) except FileNotFoundError: logger.info( f"Directory or path to truncate tables {table_names} does not exist but" " it should have been created previously!" ) + def _delete_file(self, file_path: str) -> None: + try: + # NOTE: must use rm_file to get errors on delete + self.fs_client.rm_file(file_path) + except NotImplementedError: + # not all filesystems implement the above + self.fs_client.rm(file_path) + if self.fs_client.exists(file_path): + raise FileExistsError(file_path) + def update_stored_schema( self, only_tables: Iterable[str] = None, @@ -401,6 +414,11 @@ def _get_schema_file_name(self, version_hash: str, load_id: str) -> str: f"{self.schema.name}{FILENAME_SEPARATOR}{load_id}{FILENAME_SEPARATOR}{self._to_path_safe_string(version_hash)}.jsonl", ) + def _iter_stored_schema_files(self) -> Iterator[Tuple[str, List[str]]]: + """Iterator over all stored schema files""" + for filepath, fileparts in self._list_dlt_table_files(self.schema.version_table_name): + yield filepath, fileparts + def _get_stored_schema_by_hash_or_newest( self, version_hash: str = None ) -> Optional[StorageSchemaInfo]: @@ -409,7 +427,7 @@ def _get_stored_schema_by_hash_or_newest( # find newest schema for pipeline or by version hash selected_path = None newest_load_id = "0" - for filepath, fileparts in self._list_dlt_table_files(self.schema.version_table_name): + for filepath, fileparts in self._iter_stored_schema_files(): if ( not version_hash and fileparts[0] == self.schema.name diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 5838ab2ab7..853972fcba 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -202,11 +202,18 @@ def update_stored_schema( ) return applied_update - def drop_tables(self, *tables: str, replace_schema: bool = True) -> None: + def drop_tables(self, *tables: str, delete_schema: bool = True) -> None: + """Drop tables in destination database and optionally delete the stored schema as well. + Clients that support ddl transactions will have both operations performed in a single transaction. + + Args: + tables: Names of tables to drop. 
+ delete_schema: If True, also delete all versions of the current schema from storage + """ with self.maybe_ddl_transaction(): self.sql_client.drop_tables(*tables) - if replace_schema: - self._replace_schema_in_storage(self.schema) + if delete_schema: + self._delete_schema_in_storage(self.schema) @contextlib.contextmanager def maybe_ddl_transaction(self) -> Iterator[None]: @@ -520,13 +527,12 @@ def _row_to_schema_info(self, query: str, *args: Any) -> StorageSchemaInfo: return StorageSchemaInfo(row[0], row[1], row[2], row[3], inserted_at, schema_str) - def _replace_schema_in_storage(self, schema: Schema) -> None: + def _delete_schema_in_storage(self, schema: Schema) -> None: """ - Save the given schema in storage and remove all previous versions with the same name + Delete all stored versions with the same name as given schema """ name = self.sql_client.make_qualified_table_name(self.schema.version_table_name) self.sql_client.execute_sql(f"DELETE FROM {name} WHERE schema_name = %s;", schema.name) - self._update_schema_in_storage(schema) def _update_schema_in_storage(self, schema: Schema) -> None: # get schema string or zip diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index d4298f2f6b..009cd8cc53 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -2,7 +2,7 @@ from collections.abc import Sequence as C_Sequence from copy import copy import itertools -from typing import Iterator, List, Dict, Any +from typing import Iterator, List, Dict, Any, Optional import yaml from dlt.common.configuration.container import Container @@ -32,9 +32,8 @@ ParsedLoadJobFileName, LoadPackageStateInjectableContext, TPipelineStateDoc, + commit_load_package_state, ) - - from dlt.common.utils import get_callable_name, get_full_class_name from dlt.extract.decorators import SourceInjectableContext, SourceSchemaInjectableContext @@ -46,6 +45,7 @@ from dlt.extract.storage import ExtractStorage from dlt.extract.extractors import ObjectExtractor, ArrowExtractor, Extractor from dlt.extract.utils import get_data_item_format +from dlt.pipeline.drop import drop_resources def data_to_sources( @@ -369,6 +369,7 @@ def extract( source: DltSource, max_parallel_items: int, workers: int, + load_package_state_update: Optional[Dict[str, Any]] = None, ) -> str: # generate load package to be able to commit all the sources together later load_id = self.extract_storage.create_load_package(source.discover_schema()) @@ -378,9 +379,9 @@ def extract( SourceInjectableContext(source) ), Container().injectable_context( LoadPackageStateInjectableContext( - storage=self.extract_storage.new_packages, load_id=load_id + load_id=load_id, storage=self.extract_storage.new_packages ) - ): + ) as load_package: # inject the config section with the current source name with inject_section( ConfigSectionContext( @@ -388,6 +389,9 @@ def extract( source_state_key=source.name, ) ): + if load_package_state_update: + load_package.state.update(load_package_state_update) # type: ignore[typeddict-item] + # reset resource states, the `extracted` list contains all the explicit resources and all their parents for resource in source.resources.extracted.values(): with contextlib.suppress(DataItemRequiredForDynamicTableHints): @@ -400,6 +404,7 @@ def extract( max_parallel_items=max_parallel_items, workers=workers, ) + commit_load_package_state() return load_id def commit_packages(self, pipline_state_doc: TPipelineStateDoc = None) -> None: diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py index e68c330765..7d7302aab6 
100644 --- a/dlt/helpers/airflow_helper.py +++ b/dlt/helpers/airflow_helper.py @@ -432,8 +432,8 @@ def make_task(pipeline: Pipeline, data: Any, name: str = None) -> PythonOperator elif decompose == "serialize": if not isinstance(data, DltSource): raise ValueError("Can only decompose dlt sources") - if pipeline.full_refresh: - raise ValueError("Cannot decompose pipelines with full_refresh set") + if pipeline.dev_mode: + raise ValueError("Cannot decompose pipelines with dev_mode set") # serialize tasks tasks = [] pt = None @@ -448,8 +448,8 @@ def make_task(pipeline: Pipeline, data: Any, name: str = None) -> PythonOperator if not isinstance(data, DltSource): raise ValueError("Can only decompose dlt sources") - if pipeline.full_refresh: - raise ValueError("Cannot decompose pipelines with full_refresh set") + if pipeline.dev_mode: + raise ValueError("Cannot decompose pipelines with dev_mode set") tasks = [] sources = data.decompose("scc") @@ -484,8 +484,8 @@ def make_task(pipeline: Pipeline, data: Any, name: str = None) -> PythonOperator if not isinstance(data, DltSource): raise ValueError("Can only decompose dlt sources") - if pipeline.full_refresh: - raise ValueError("Cannot decompose pipelines with full_refresh set") + if pipeline.dev_mode: + raise ValueError("Cannot decompose pipelines with dev_mode set") # parallel tasks tasks = [] diff --git a/dlt/load/load.py b/dlt/load/load.py index 9d898bc54d..d96a6b7116 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -1,24 +1,24 @@ import contextlib from functools import reduce import datetime # noqa: 251 -from typing import Dict, List, Optional, Tuple, Set, Iterator, Iterable +from typing import Dict, List, Optional, Tuple, Set, Iterator, Iterable, Sequence from concurrent.futures import Executor import os +from copy import deepcopy from dlt.common import logger from dlt.common.runtime.signals import sleep from dlt.common.configuration import with_config, known_sections from dlt.common.configuration.resolve import inject_section from dlt.common.configuration.accessors import config -from dlt.common.pipeline import ( - LoadInfo, - LoadMetrics, - SupportsPipeline, - WithStepInfo, -) +from dlt.common.pipeline import LoadInfo, LoadMetrics, SupportsPipeline, WithStepInfo from dlt.common.schema.utils import get_top_level_table +from dlt.common.schema.typing import TTableSchema from dlt.common.storages.load_storage import LoadPackageInfo, ParsedLoadJobFileName, TJobState -from dlt.common.storages.load_package import LoadPackageStateInjectableContext +from dlt.common.storages.load_package import ( + LoadPackageStateInjectableContext, + load_package as current_load_package, +) from dlt.common.runners import TRunMetrics, Runnable, workermethod, NullExecutor from dlt.common.runtime.collector import Collector, NULL_COLLECTOR from dlt.common.logger import pretty_format_exception @@ -362,6 +362,9 @@ def complete_package(self, load_id: str, schema: Schema, aborted: bool = False) def load_single_package(self, load_id: str, schema: Schema) -> None: new_jobs = self.get_new_jobs_info(load_id) + + dropped_tables = current_load_package()["state"].get("dropped_tables", []) + truncated_tables = current_load_package()["state"].get("truncated_tables", []) # initialize analytical storage ie. 
create dataset required by passed schema with self.get_destination_client(schema) as job_client: if (expected_update := self.load_storage.begin_schema_update(load_id)) is not None: @@ -377,6 +380,8 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: if isinstance(job_client, WithStagingDataset) else None ), + drop_tables=dropped_tables, + truncate_tables=truncated_tables, ) # init staging client @@ -385,6 +390,7 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: f"Job client for destination {self.destination.destination_type} does not" " implement SupportsStagingDestination" ) + with self.get_staging_destination_client(schema) as staging_client: init_client( staging_client, @@ -392,7 +398,10 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: new_jobs, expected_update, job_client.should_truncate_table_before_load_on_staging_destination, + # should_truncate_staging, job_client.should_load_data_to_staging_dataset_on_staging_destination, + drop_tables=dropped_tables, + truncate_tables=truncated_tables, ) self.load_storage.commit_schema_update(load_id, applied_update) diff --git a/dlt/load/utils.py b/dlt/load/utils.py index 067ae33613..5126cbd11e 100644 --- a/dlt/load/utils.py +++ b/dlt/load/utils.py @@ -1,4 +1,4 @@ -from typing import List, Set, Iterable, Callable +from typing import List, Set, Iterable, Callable, Optional from dlt.common import logger from dlt.common.storages.load_package import LoadJobInfo, PackageStorage @@ -30,7 +30,10 @@ def get_completed_table_chain( # returns ordered list of tables from parent to child leaf tables table_chain: List[TTableSchema] = [] # allow for jobless tables for those write disposition - skip_jobless_table = top_merged_table["write_disposition"] not in ("replace", "merge") + skip_jobless_table = top_merged_table["write_disposition"] not in ( + "replace", + "merge", + ) # make sure all the jobs for the table chain is completed for table in map( @@ -66,6 +69,8 @@ def init_client( expected_update: TSchemaTables, truncate_filter: Callable[[TTableSchema], bool], load_staging_filter: Callable[[TTableSchema], bool], + drop_tables: Optional[List[TTableSchema]] = None, + truncate_tables: Optional[List[TTableSchema]] = None, ) -> TSchemaTables: """Initializes destination storage including staging dataset if supported @@ -78,12 +83,15 @@ def init_client( expected_update (TSchemaTables): Schema update as in load package. 
Always present even if empty truncate_filter (Callable[[TTableSchema], bool]): A filter that tells which table in destination dataset should be truncated load_staging_filter (Callable[[TTableSchema], bool]): A filter which tell which table in the staging dataset may be loaded into + drop_tables (Optional[List[TTableSchema]]): List of tables to drop before initializing storage + truncate_tables (Optional[List[TTableSchema]]): List of tables to truncate before initializing storage Returns: TSchemaTables: Actual migrations done at destination """ # get dlt/internal tables dlt_tables = set(schema.dlt_table_names()) + # tables without data (TODO: normalizer removes such jobs, write tests and remove the line below) tables_no_data = set( table["name"] for table in schema.data_tables() if not has_table_seen_data(table) @@ -92,12 +100,22 @@ def init_client( tables_with_jobs = set(job.table_name for job in new_jobs) - tables_no_data # get tables to truncate by extending tables with jobs with all their child tables - truncate_tables = set( - _extend_tables_with_table_chain(schema, tables_with_jobs, tables_with_jobs, truncate_filter) + initial_truncate_names = set(t["name"] for t in truncate_tables) if truncate_tables else set() + truncate_table_names = set( + _extend_tables_with_table_chain( + schema, + tables_with_jobs, + tables_with_jobs, + lambda t: truncate_filter(t) or t["name"] in initial_truncate_names, + ) ) applied_update = _init_dataset_and_update_schema( - job_client, expected_update, tables_with_jobs | dlt_tables, truncate_tables + job_client, + expected_update, + tables_with_jobs | dlt_tables, + truncate_table_names, + drop_tables=drop_tables, ) # update the staging dataset if client supports this @@ -128,6 +146,7 @@ def _init_dataset_and_update_schema( update_tables: Iterable[str], truncate_tables: Iterable[str] = None, staging_info: bool = False, + drop_tables: Optional[List[TTableSchema]] = None, ) -> TSchemaTables: staging_text = "for staging dataset" if staging_info else "" logger.info( @@ -135,16 +154,26 @@ def _init_dataset_and_update_schema( f" {staging_text}" ) job_client.initialize_storage() + if drop_tables: + drop_table_names = [table["name"] for table in drop_tables] + if hasattr(job_client, "drop_tables"): + logger.info( + f"Client for {job_client.config.destination_type} will drop tables {staging_text}" + ) + job_client.drop_tables(*drop_table_names, delete_schema=True) + logger.info( f"Client for {job_client.config.destination_type} will update schema to package schema" f" {staging_text}" ) + applied_update = job_client.update_stored_schema( only_tables=update_tables, expected_update=expected_update ) logger.info( f"Client for {job_client.config.destination_type} will truncate tables {staging_text}" ) + job_client.initialize_storage(truncate_tables=truncate_tables) return applied_update @@ -167,7 +196,10 @@ def _extend_tables_with_table_chain( # for replace and merge write dispositions we should include tables # without jobs in the table chain, because child tables may need # processing due to changes in the root table - skip_jobless_table = top_job_table["write_disposition"] not in ("replace", "merge") + skip_jobless_table = top_job_table["write_disposition"] not in ( + "replace", + "merge", + ) for table in map( lambda t: fill_hints_from_parent_and_clone_table(schema.tables, t), get_child_tables(schema.tables, top_job_table["name"]), diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index c9e7b5097c..f8900ae562 100644 --- a/dlt/pipeline/__init__.py +++ 
b/dlt/pipeline/__init__.py @@ -1,4 +1,4 @@ -from typing import Sequence, Type, cast, overload +from typing import Sequence, Type, cast, overload, Optional from typing_extensions import TypeVar from dlt.common.schema import Schema @@ -9,12 +9,12 @@ from dlt.common.configuration.container import Container from dlt.common.configuration.inject import get_orig_args, last_config from dlt.common.destination import TLoaderFileFormat, Destination, TDestinationReferenceArg -from dlt.common.pipeline import LoadInfo, PipelineContext, get_dlt_pipelines_dir +from dlt.common.pipeline import LoadInfo, PipelineContext, get_dlt_pipelines_dir, TRefreshMode from dlt.pipeline.configuration import PipelineConfiguration, ensure_correct_pipeline_kwargs from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.progress import _from_name as collector_from_name, TCollectorArg, _NULL_COLLECTOR -from dlt.pipeline.warnings import credentials_argument_deprecated +from dlt.pipeline.warnings import credentials_argument_deprecated, full_refresh_argument_deprecated TPipeline = TypeVar("TPipeline", bound=Pipeline, default=Pipeline) @@ -29,7 +29,9 @@ def pipeline( dataset_name: str = None, import_schema_path: str = None, export_schema_path: str = None, - full_refresh: bool = False, + full_refresh: Optional[bool] = None, + dev_mode: bool = False, + refresh: Optional[TRefreshMode] = None, credentials: Any = None, progress: TCollectorArg = _NULL_COLLECTOR, _impl_cls: Type[TPipeline] = Pipeline, # type: ignore[assignment] @@ -67,9 +69,15 @@ def pipeline( export_schema_path (str, optional): A path where the schema `yaml` file will be exported after every schema change. Defaults to None which disables exporting. - full_refresh (bool, optional): When set to True, each instance of the pipeline with the `pipeline_name` starts from scratch when run and loads the data to a separate dataset. + dev_mode (bool, optional): When set to True, each instance of the pipeline with the `pipeline_name` starts from scratch when run and loads the data to a separate dataset. The datasets are identified by `dataset_name_` + datetime suffix. Use this setting whenever you experiment with your data to be sure you start fresh on each run. Defaults to False. + refresh (str | TRefreshMode): Fully or partially reset sources during pipeline run. When set here the refresh is applied on each run of the pipeline. + To apply refresh only once you can pass it to `pipeline.run` or `extract` instead. The following refresh modes are supported: + * `drop_sources`: Drop tables and source and resource state for all sources currently being processed in `run` or `extract` methods of the pipeline. (Note: schema history is erased) + * `drop_resources`: Drop tables and resource state for all resources being processed. Source level state is not modified. (Note: schema history is erased) + * `drop_data`: Wipe all data and resource state for all resources being processed. Schema is not modified. + credentials (Any, optional): Credentials for the `destination` ie. database connection string or a dictionary with google cloud credentials. In most cases should be set to None, which lets `dlt` to use `secrets.toml` or environment variables to infer right credentials values. 
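    # A minimal sketch of how the new `refresh` option combines with a pipeline
    # declaration (the pipeline, destination, and source names are placeholders):
    #
    #     import dlt
    #
    #     pipeline = dlt.pipeline("my_pipeline", destination="duckdb", refresh="drop_sources")
    #     pipeline.run(my_source())  # drops tables and source/resource state of processed sources, then reloads
    #
    #     # refresh can also be applied to a single run instead of the whole pipeline:
    #     pipeline.run(my_source(), refresh="drop_data")  # wipes data and resource state, keeps the schema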
@@ -98,7 +106,9 @@ def pipeline( dataset_name: str = None, import_schema_path: str = None, export_schema_path: str = None, - full_refresh: bool = False, + full_refresh: Optional[bool] = None, + dev_mode: bool = False, + refresh: Optional[TRefreshMode] = None, credentials: Any = None, progress: TCollectorArg = _NULL_COLLECTOR, _impl_cls: Type[TPipeline] = Pipeline, # type: ignore[assignment] @@ -111,6 +121,7 @@ def pipeline( has_arguments = bool(orig_args[0]) or any(orig_args[1].values()) credentials_argument_deprecated("pipeline", credentials, destination) + full_refresh_argument_deprecated("pipeline", full_refresh) if not has_arguments: context = Container()[PipelineContext] @@ -144,11 +155,12 @@ def pipeline( credentials, import_schema_path, export_schema_path, - full_refresh, + full_refresh if full_refresh is not None else dev_mode, progress, False, last_config(**kwargs), kwargs["runtime"], + refresh=refresh, ) # set it as current pipeline p.activate() @@ -160,13 +172,15 @@ def attach( pipeline_name: str = None, pipelines_dir: str = None, pipeline_salt: TSecretValue = None, - full_refresh: bool = False, + full_refresh: Optional[bool] = None, + dev_mode: bool = False, credentials: Any = None, progress: TCollectorArg = _NULL_COLLECTOR, **kwargs: Any, ) -> Pipeline: """Attaches to the working folder of `pipeline_name` in `pipelines_dir` or in default directory. Requires that valid pipeline state exists in working folder.""" ensure_correct_pipeline_kwargs(attach, **kwargs) + full_refresh_argument_deprecated("attach", full_refresh) # if working_dir not provided use temp folder if not pipelines_dir: pipelines_dir = get_dlt_pipelines_dir() @@ -182,7 +196,7 @@ def attach( credentials, None, None, - full_refresh, + full_refresh if full_refresh is not None else dev_mode, progress, True, last_config(**kwargs), diff --git a/dlt/pipeline/configuration.py b/dlt/pipeline/configuration.py index 8c46ed049f..235ba3485a 100644 --- a/dlt/pipeline/configuration.py +++ b/dlt/pipeline/configuration.py @@ -5,6 +5,7 @@ from dlt.common.typing import AnyFun, TSecretValue from dlt.common.utils import digest256 from dlt.common.destination import TLoaderFileFormat +from dlt.common.pipeline import TRefreshMode @configspec @@ -24,10 +25,14 @@ class PipelineConfiguration(BaseConfiguration): """Enables the tracing. Tracing saves the execution trace locally and is required by `dlt deploy`.""" use_single_dataset: bool = True """Stores all schemas in single dataset. When False, each schema will get a separate dataset with `{dataset_name}_{schema_name}""" - full_refresh: bool = False + full_refresh: Optional[bool] = None + """Deprecated. Use `dev_mode` instead. When set to True, each instance of the pipeline with the `pipeline_name` starts from scratch when run and loads the data to a separate dataset.""" + dev_mode: bool = False """When set to True, each instance of the pipeline with the `pipeline_name` starts from scratch when run and loads the data to a separate dataset.""" progress: Optional[str] = None runtime: RunConfiguration = None + refresh: Optional[TRefreshMode] = None + """Refresh mode for the pipeline to fully or partially reset a source during run. 
See docstring of `dlt.pipeline` for more details.""" def on_resolved(self) -> None: if not self.pipeline_name: diff --git a/dlt/pipeline/drop.py b/dlt/pipeline/drop.py new file mode 100644 index 0000000000..486bead2f4 --- /dev/null +++ b/dlt/pipeline/drop.py @@ -0,0 +1,171 @@ +from typing import Union, Iterable, Optional, List, Dict, Any, Tuple, TypedDict +from copy import deepcopy +from itertools import chain +from dataclasses import dataclass + +from dlt.common.schema import Schema +from dlt.common.pipeline import ( + TPipelineState, + _sources_state, + _get_matching_resources, + _get_matching_sources, + reset_resource_state, + _delete_source_state_keys, +) +from dlt.common.schema.typing import TSimpleRegex, TTableSchema +from dlt.common.schema.utils import ( + group_tables_by_resource, + compile_simple_regexes, + compile_simple_regex, +) +from dlt.common import jsonpath +from dlt.common.typing import REPattern + + +class _DropInfo(TypedDict): + tables: List[str] + resource_states: List[str] + resource_names: List[str] + state_paths: List[str] + schema_name: str + dataset_name: Optional[str] + drop_all: bool + resource_pattern: Optional[REPattern] + warnings: List[str] + + +@dataclass +class _DropResult: + schema: Schema + state: TPipelineState + info: _DropInfo + dropped_tables: List[TTableSchema] + + +def _create_modified_state( + state: TPipelineState, + resource_pattern: Optional[REPattern], + source_pattern: REPattern, + state_paths: jsonpath.TAnyJsonPath, + info: _DropInfo, +) -> Tuple[TPipelineState, _DropInfo]: + # if not self.drop_state: + # return state # type: ignore[return-value] + all_source_states = _sources_state(state) + for source_name in _get_matching_sources(source_pattern, state): + source_state = all_source_states[source_name] + # drop table states + if resource_pattern: + for key in _get_matching_resources(resource_pattern, source_state): + info["resource_states"].append(key) + reset_resource_state(key, source_state) + # drop additional state paths + # Don't drop 'resources' key if jsonpath is wildcard + resolved_paths = [ + p for p in jsonpath.resolve_paths(state_paths, source_state) if p != "resources" + ] + if state_paths and not resolved_paths: + info["warnings"].append( + f"State paths {state_paths} did not select any paths in source {source_name}" + ) + _delete_source_state_keys(resolved_paths, source_state) + info["state_paths"].extend(f"{source_name}.{p}" for p in resolved_paths) + return state, info + + +def drop_resources( + schema: Schema, + state: TPipelineState, + resources: Union[Iterable[Union[str, TSimpleRegex]], Union[str, TSimpleRegex]] = (), + state_paths: jsonpath.TAnyJsonPath = (), + drop_all: bool = False, + state_only: bool = False, + sources: Optional[Union[Iterable[Union[str, TSimpleRegex]], Union[str, TSimpleRegex]]] = None, +) -> _DropResult: + """Generate a new schema and pipeline state with the requested resources removed. + + Args: + schema: The schema to modify. + state: The pipeline state to modify. + resources: Resource name(s) or regex pattern(s) matching resource names to drop. + If empty, no resources will be dropped unless `drop_all` is True. + state_paths: JSON path(s) relative to the source state to drop. + drop_all: If True, all resources will be dropped (supeseeds `resources`). 
+ state_only: If True, only modify the pipeline state, not schema + sources: Only wipe state for sources matching the name(s) or regex pattern(s) in this list + If not set all source states will be modified according to `state_paths` and `resources` + + Returns: + A 3 part tuple containing the new schema, the new pipeline state, and a dictionary + containing information about the drop operation. + """ + + if isinstance(resources, str): + resources = [resources] + resources = list(resources) + if isinstance(sources, str): + sources = [sources] + if sources is not None: + sources = list(sources) + if isinstance(state_paths, str): + state_paths = [state_paths] + + state_paths = jsonpath.compile_paths(state_paths) + + schema = schema.clone() + state = deepcopy(state) + + resources = set(resources) + if drop_all: + resource_pattern = compile_simple_regex(TSimpleRegex("re:.*")) # Match everything + elif resources: + resource_pattern = compile_simple_regexes(TSimpleRegex(r) for r in resources) + else: + resource_pattern = None + if sources is not None: + source_pattern = compile_simple_regexes(TSimpleRegex(s) for s in sources) + else: + source_pattern = compile_simple_regex(TSimpleRegex("re:.*")) # Match everything + + if resource_pattern: + data_tables = { + t["name"]: t for t in schema.data_tables(seen_data_only=True) + } # Don't remove _dlt tables + resource_tables = group_tables_by_resource(data_tables, pattern=resource_pattern) + resource_names = list(resource_tables.keys()) + # TODO: If drop_tables + if not state_only: + tables_to_drop = list(chain.from_iterable(resource_tables.values())) + tables_to_drop.reverse() + else: + tables_to_drop = [] + else: + tables_to_drop = [] + resource_names = [] + + info: _DropInfo = dict( + tables=[t["name"] for t in tables_to_drop], + resource_states=[], + state_paths=[], + resource_names=resource_names, + schema_name=schema.name, + dataset_name=None, + drop_all=drop_all, + resource_pattern=resource_pattern, + warnings=[], + ) + + new_state, info = _create_modified_state( + state, resource_pattern, source_pattern, state_paths, info + ) + info["resource_names"] = resource_names + + if resource_pattern and not resource_tables: + info["warnings"].append( + f"Specified resource(s) {str(resources)} did not select any table(s) in schema" + f" {schema.name}. 
Possible resources are:" + f" {list(group_tables_by_resource(data_tables).keys())}" + ) + + dropped_tables = schema.drop_tables([t["name"] for t in tables_to_drop], seen_data_only=True) + return _DropResult(schema, new_state, info, dropped_tables) diff --git a/dlt/pipeline/helpers.py b/dlt/pipeline/helpers.py index c1c3326171..0defbc14eb 100644 --- a/dlt/pipeline/helpers.py +++ b/dlt/pipeline/helpers.py @@ -1,25 +1,19 @@ -import contextlib -from typing import Callable, Sequence, Iterable, Optional, Any, List, Dict, Union, TypedDict -from itertools import chain +from copy import deepcopy +from typing import ( + Callable, + Sequence, + Iterable, + Optional, + Any, + Dict, + Union, + TYPE_CHECKING, +) -from dlt.common.jsonpath import resolve_paths, TAnyJsonPath, compile_paths +from dlt.common.jsonpath import TAnyJsonPath from dlt.common.exceptions import TerminalException -from dlt.common.schema.utils import ( - group_tables_by_resource, - compile_simple_regexes, - compile_simple_regex, -) from dlt.common.schema.typing import TSimpleRegex -from dlt.common.typing import REPattern -from dlt.common.pipeline import ( - reset_resource_state, - _sources_state, - _delete_source_state_keys, - _get_matching_resources, -) -from dlt.common.destination.reference import WithStagingDataset - -from dlt.destinations.exceptions import DatabaseUndefinedRelation +from dlt.common.pipeline import pipeline_state as current_pipeline_state, TRefreshMode from dlt.pipeline.exceptions import ( PipelineNeverRan, PipelineStepFailed, @@ -27,7 +21,11 @@ ) from dlt.pipeline.state_sync import force_state_extract from dlt.pipeline.typing import TPipelineStep -from dlt.pipeline import Pipeline +from dlt.pipeline.drop import drop_resources +from dlt.extract import DltSource + +if TYPE_CHECKING: + from dlt.pipeline import Pipeline def retry_load( @@ -62,87 +60,48 @@ def _retry_load(ex: BaseException) -> bool: return _retry_load -class _DropInfo(TypedDict): - tables: List[str] - resource_states: List[str] - resource_names: List[str] - state_paths: List[str] - schema_name: str - dataset_name: str - drop_all: bool - resource_pattern: Optional[REPattern] - warnings: List[str] - - class DropCommand: def __init__( self, - pipeline: Pipeline, + pipeline: "Pipeline", resources: Union[Iterable[Union[str, TSimpleRegex]], Union[str, TSimpleRegex]] = (), schema_name: Optional[str] = None, state_paths: TAnyJsonPath = (), drop_all: bool = False, state_only: bool = False, ) -> None: + """ + Args: + pipeline: Pipeline to drop tables and state from + resources: List of resources to drop. If empty, no resources are dropped unless `drop_all` is True + schema_name: Name of the schema to drop tables from. 
If not specified, the default schema is used + state_paths: JSON path(s) relative to the source state to drop + drop_all: Drop all resources and tables in the schema (supersedes `resources` list) + state_only: Drop only state, not tables + """ self.pipeline = pipeline - if isinstance(resources, str): - resources = [resources] - if isinstance(state_paths, str): - state_paths = [state_paths] if not pipeline.default_schema_name: raise PipelineNeverRan(pipeline.pipeline_name, pipeline.pipelines_dir) - self.schema = pipeline.schemas[schema_name or pipeline.default_schema_name].clone() - self.schema_tables = self.schema.tables - self.drop_tables = not state_only - self.drop_state = True - self.state_paths_to_drop = compile_paths(state_paths) - - resources = set(resources) - resource_names = [] - if drop_all: - self.resource_pattern = compile_simple_regex(TSimpleRegex("re:.*")) # Match everything - elif resources: - self.resource_pattern = compile_simple_regexes(TSimpleRegex(r) for r in resources) - else: - self.resource_pattern = None - - if self.resource_pattern: - data_tables = { - t["name"]: t for t in self.schema.data_tables() - } # Don't remove _dlt tables - resource_tables = group_tables_by_resource(data_tables, pattern=self.resource_pattern) - if self.drop_tables: - self.tables_to_drop = list(chain.from_iterable(resource_tables.values())) - self.tables_to_drop.reverse() - else: - self.tables_to_drop = [] - resource_names = list(resource_tables.keys()) - else: - self.tables_to_drop = [] - self.drop_tables = False # No tables to drop - self.drop_state = not not self.state_paths_to_drop # obtain truth value - - self.drop_all = drop_all - self.info: _DropInfo = dict( - tables=[t["name"] for t in self.tables_to_drop], - resource_states=[], - state_paths=[], - resource_names=resource_names, - schema_name=self.schema.name, - dataset_name=self.pipeline.dataset_name, - drop_all=drop_all, - resource_pattern=self.resource_pattern, - warnings=[], + + drop_result = drop_resources( + # self._drop_schema, self._new_state, self.info = drop_resources( + self.schema, + pipeline.state, + resources, + state_paths, + drop_all, + state_only, ) - if self.resource_pattern and not resource_tables: - self.info["warnings"].append( - f"Specified resource(s) {str(resources)} did not select any table(s) in schema" - f" {self.schema.name}. 
Possible resources are:" - f" {list(group_tables_by_resource(data_tables).keys())}" - ) - self._new_state = self._create_modified_state() + + self._new_state = drop_result.state + self.info = drop_result.info + self._new_schema = drop_result.schema + self._dropped_tables = drop_result.dropped_tables + self.drop_tables = not state_only and bool(self._dropped_tables) + + self.drop_state = bool(drop_all or resources or state_paths) @property def is_empty(self) -> bool: @@ -152,58 +111,6 @@ def is_empty(self) -> bool: and len(self.info["resource_states"]) == 0 ) - def _drop_destination_tables(self) -> None: - table_names = [tbl["name"] for tbl in self.tables_to_drop] - for table_name in table_names: - assert table_name not in self.schema._schema_tables, ( - f"You are dropping table {table_name} in {self.schema.name} but it is still present" - " in the schema" - ) - with self.pipeline._sql_job_client(self.schema) as client: - client.drop_tables(*table_names, replace_schema=True) - # also delete staging but ignore if staging does not exist - if isinstance(client, WithStagingDataset): - with contextlib.suppress(DatabaseUndefinedRelation): - with client.with_staging_dataset(): - client.drop_tables(*table_names, replace_schema=True) - - def _delete_schema_tables(self) -> None: - for tbl in self.tables_to_drop: - del self.schema_tables[tbl["name"]] - # bump schema, we'll save later - self.schema._bump_version() - - def _list_state_paths(self, source_state: Dict[str, Any]) -> List[str]: - return resolve_paths(self.state_paths_to_drop, source_state) - - def _create_modified_state(self) -> Dict[str, Any]: - state = self.pipeline.state - if not self.drop_state: - return state # type: ignore[return-value] - source_states = _sources_state(state).items() - for source_name, source_state in source_states: - # drop table states - if self.drop_state and self.resource_pattern: - for key in _get_matching_resources(self.resource_pattern, source_state): - self.info["resource_states"].append(key) - reset_resource_state(key, source_state) - # drop additional state paths - resolved_paths = resolve_paths(self.state_paths_to_drop, source_state) - if self.state_paths_to_drop and not resolved_paths: - self.info["warnings"].append( - f"State paths {self.state_paths_to_drop} did not select any paths in source" - f" {source_name}" - ) - _delete_source_state_keys(resolved_paths, source_state) - self.info["state_paths"].extend(f"{source_name}.{p}" for p in resolved_paths) - return state # type: ignore[return-value] - - def _extract_state(self) -> None: - state: Dict[str, Any] - with self.pipeline.managed_state(extract_state=True) as state: # type: ignore[assignment] - state.clear() - state.update(self._new_state) - def __call__(self) -> None: if ( self.pipeline.has_pending_data @@ -216,14 +123,16 @@ def __call__(self) -> None: if not self.drop_state and not self.drop_tables: return # Nothing to drop - if self.drop_tables: - self._delete_schema_tables() - self._drop_destination_tables() - if self.drop_tables: - self.pipeline.schemas.save_schema(self.schema) - if self.drop_state: - self._extract_state() - # Send updated state to destination + self._new_schema._bump_version() + new_state = deepcopy(self._new_state) + force_state_extract(new_state) + + self.pipeline._save_and_extract_state_and_schema( + new_state, + schema=self._new_schema, + load_package_state_update={"dropped_tables": self._dropped_tables}, + ) + self.pipeline.normalize() try: self.pipeline.load(raise_on_failed_jobs=True) @@ -232,11 +141,13 @@ def 
__call__(self) -> None: self.pipeline.drop_pending_packages() with self.pipeline.managed_state() as state: force_state_extract(state) + # Restore original schema file so all tables are known on next run + self.pipeline.schemas.save_schema(self.schema) raise def drop( - pipeline: Pipeline, + pipeline: "Pipeline", resources: Union[Iterable[str], str] = (), schema_name: str = None, state_paths: TAnyJsonPath = (), @@ -244,3 +155,34 @@ def drop( state_only: bool = False, ) -> None: return DropCommand(pipeline, resources, schema_name, state_paths, drop_all, state_only)() + + +def refresh_source( + pipeline: "Pipeline", source: DltSource, refresh: TRefreshMode +) -> Dict[str, Any]: + """Run the pipeline's refresh mode on the given source, updating the source's schema and state. + + Returns: + The new load package state containing tables that need to be dropped/truncated. + """ + if pipeline.first_run: + return {} + pipeline_state, _ = current_pipeline_state(pipeline._container) + _resources_to_drop = list(source.resources.extracted) if refresh != "drop_sources" else [] + drop_result = drop_resources( + source.schema, + pipeline_state, + resources=_resources_to_drop, + drop_all=refresh == "drop_sources", + state_paths="*" if refresh == "drop_sources" else [], + sources=source.name, + ) + load_package_state = {} + if drop_result.dropped_tables: + key = "dropped_tables" if refresh != "drop_data" else "truncated_tables" + load_package_state[key] = drop_result.dropped_tables + if refresh != "drop_data": # drop_data is only data wipe, keep original schema + source.schema = drop_result.schema + if "sources" in drop_result.state: + pipeline_state["sources"] = drop_result.state["sources"] + return load_package_state diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 53770f332d..81b50a8326 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -16,6 +16,7 @@ cast, get_type_hints, ContextManager, + Dict, ) from dlt import version @@ -45,6 +46,7 @@ TWriteDispositionConfig, TAnySchemaColumns, TSchemaContract, + TTableSchema, ) from dlt.common.schema.utils import normalize_schema_name from dlt.common.storages.exceptions import LoadPackageNotFound @@ -95,6 +97,7 @@ StateInjectableContext, TStepMetrics, WithStepInfo, + TRefreshMode, ) from dlt.common.schema import Schema from dlt.common.utils import is_interactive @@ -122,6 +125,7 @@ PipelineStepFailed, SqlClientNotAvailable, FSClientNotAvailable, + PipelineNeverRan, ) from dlt.pipeline.trace import ( PipelineTrace, @@ -133,6 +137,7 @@ end_trace_step, end_trace, ) +from dlt.common.pipeline import pipeline_state as current_pipeline_state from dlt.pipeline.typing import TPipelineStep from dlt.pipeline.state_sync import ( PIPELINE_STATE_ENGINE_VERSION, @@ -145,6 +150,7 @@ ) from dlt.pipeline.warnings import credentials_argument_deprecated from dlt.common.storages.load_package import TLoadPackageState +from dlt.pipeline.helpers import refresh_source def with_state_sync(may_extract_state: bool = False) -> Callable[[TFun], TFun]: @@ -293,7 +299,7 @@ class Pipeline(SupportsPipeline): schema_names: List[str] = [] first_run: bool = False """Indicates a first run of the pipeline, where run ends with successful loading of the data""" - full_refresh: bool + dev_mode: bool must_attach_to_local_pipeline: bool pipelines_dir: str """A directory where the pipelines' working directories are created""" @@ -310,6 +316,7 @@ class Pipeline(SupportsPipeline): collector: _Collector config: PipelineConfiguration runtime_config: RunConfiguration + 
refresh: Optional[TRefreshMode] = None def __init__( self, @@ -322,20 +329,22 @@ def __init__( credentials: Any, import_schema_path: str, export_schema_path: str, - full_refresh: bool, + dev_mode: bool, progress: _Collector, must_attach_to_local_pipeline: bool, config: PipelineConfiguration, runtime: RunConfiguration, + refresh: Optional[TRefreshMode] = None, ) -> None: """Initializes the Pipeline class which implements `dlt` pipeline. Please use `pipeline` function in `dlt` module to create a new Pipeline instance.""" self.pipeline_salt = pipeline_salt self.config = config self.runtime_config = runtime - self.full_refresh = full_refresh + self.dev_mode = dev_mode self.collector = progress or _NULL_COLLECTOR self.destination = None self.staging = None + self.refresh = refresh self._container = Container() self._pipeline_instance_id = self._create_pipeline_instance_id() @@ -379,7 +388,7 @@ def drop(self, pipeline_name: str = None) -> "Pipeline": self.credentials, self._schema_storage.config.import_schema_path, self._schema_storage.config.export_schema_path, - self.full_refresh, + self.dev_mode, self.collector, False, self.config, @@ -403,8 +412,10 @@ def extract( max_parallel_items: int = None, workers: int = None, schema_contract: TSchemaContract = None, + refresh: Optional[TRefreshMode] = None, ) -> ExtractInfo: """Extracts the `data` and prepare it for the normalization. Does not require destination or credentials to be configured. See `run` method for the arguments' description.""" + # create extract storage to which all the sources will be extracted extract_step = Extract( self._schema_storage, @@ -428,7 +439,14 @@ def extract( ): if source.exhausted: raise SourceExhausted(source.name) - self._extract_source(extract_step, source, max_parallel_items, workers) + + self._extract_source( + extract_step, + source, + max_parallel_items, + workers, + refresh=refresh or self.refresh, + ) # extract state state: TPipelineStateDoc = None if self.config.restore_from_destination: @@ -580,6 +598,7 @@ def run( schema: Schema = None, loader_file_format: TLoaderFileFormat = None, schema_contract: TSchemaContract = None, + refresh: Optional[TRefreshMode] = None, ) -> LoadInfo: """Loads the data from `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. @@ -633,6 +652,11 @@ def run( schema_contract (TSchemaContract, optional): On override for the schema contract settings, this will replace the schema contract settings for all tables in the schema. Defaults to None. + refresh (str | TRefreshMode): Fully or partially reset sources before loading new data in this run. The following refresh modes are supported: + * `drop_sources`: Drop tables and source and resource state for all sources currently being processed in `run` or `extract` methods of the pipeline. (Note: schema history is erased) + * `drop_resources`: Drop tables and resource state for all resources being processed. Source level state is not modified. (Note: schema history is erased) + * `drop_data`: Wipe all data and resource state for all resources being processed. Schema is not modified. + Raises: PipelineStepFailed when a problem happened during `extract`, `normalize` or `load` steps. 
Returns: @@ -648,7 +672,7 @@ def run( # sync state with destination if ( self.config.restore_from_destination - and not self.full_refresh + and not self.dev_mode and not self._state_restored and (self.destination or destination) ): @@ -679,6 +703,7 @@ def run( primary_key=primary_key, schema=schema, schema_contract=schema_contract, + refresh=refresh or self.refresh, ) self.normalize(loader_file_format=loader_file_format) return self.load(destination, dataset_name, credentials=credentials) @@ -1031,7 +1056,7 @@ def _init_working_dir(self, pipeline_name: str, pipelines_dir: str) -> None: # create pipeline storage, do not create working dir yet self._pipeline_storage = FileStorage(self.working_dir, makedirs=False) # if full refresh was requested, wipe out all data from working folder, if exists - if self.full_refresh: + if self.dev_mode: self._wipe_working_folder() def _configure( @@ -1083,7 +1108,13 @@ def _attach_pipeline(self) -> None: pass def _extract_source( - self, extract: Extract, source: DltSource, max_parallel_items: int, workers: int + self, + extract: Extract, + source: DltSource, + max_parallel_items: int, + workers: int, + refresh: Optional[TRefreshMode] = None, + load_package_state_update: Optional[Dict[str, Any]] = None, ) -> str: # discover the existing pipeline schema try: @@ -1102,8 +1133,14 @@ def _extract_source( except FileNotFoundError: pass + load_package_state_update = dict(load_package_state_update or {}) + if refresh: + load_package_state_update.update(refresh_source(self, source, refresh)) + # extract into pipeline schema - load_id = extract.extract(source, max_parallel_items, workers) + load_id = extract.extract( + source, max_parallel_items, workers, load_package_state_update=load_package_state_update + ) # save import with fully discovered schema # NOTE: moved to with_schema_sync, remove this if all test pass @@ -1145,9 +1182,9 @@ def _get_destination_client_initial_config( # this client support many schemas and datasets if issubclass(client_spec, DestinationClientDwhConfiguration): - if not self.dataset_name and self.full_refresh: + if not self.dataset_name and self.dev_mode: logger.warning( - "Full refresh may not work if dataset name is not set. Please set the" + "Dev mode may not work if dataset name is not set. 
Please set the" " dataset_name argument in dlt.pipeline or run method" ) # set default schema name to load all incoming data to a single dataset, no matter what is the current schema name @@ -1335,8 +1372,8 @@ def _set_dataset_name(self, new_dataset_name: str) -> None: if not new_dataset_name: return - # in case of full refresh add unique suffix - if self.full_refresh: + # in case of dev_mode add unique suffix + if self.dev_mode: # dataset must be specified # double _ is not allowed if new_dataset_name.endswith("_"): @@ -1532,8 +1569,37 @@ def _props_to_state(self, state: TPipelineState) -> TPipelineState: state["schema_names"] = self._list_schemas_sorted() return state + def _save_and_extract_state_and_schema( + self, + state: TPipelineState, + schema: Schema, + load_package_state_update: Optional[Dict[str, Any]] = None, + ) -> None: + """Save given state + schema and extract creating a new load package + + Args: + state: The new pipeline state, replaces the current state + schema: The new source schema, replaces current schema of the same name + load_package_state_update: Dict which items will be included in the load package state + """ + self.schemas.save_schema(schema) + with self.managed_state() as old_state: + old_state.update(state) + + self._bump_version_and_extract_state( + state, + extract_state=True, + load_package_state_update=load_package_state_update, + schema=schema, + ) + def _bump_version_and_extract_state( - self, state: TPipelineState, extract_state: bool, extract: Extract = None + self, + state: TPipelineState, + extract_state: bool, + extract: Extract = None, + load_package_state_update: Optional[Dict[str, Any]] = None, + schema: Optional[Schema] = None, ) -> TPipelineStateDoc: """Merges existing state into `state` and extracts state using `storage` if extract_state is True. @@ -1547,7 +1613,11 @@ def _bump_version_and_extract_state( self._schema_storage, self._normalize_storage_config(), original_data=data ) self._extract_source( - extract_, data_to_sources(data, self, self.default_schema)[0], 1, 1 + extract_, + data_to_sources(data, self, schema or self.default_schema)[0], + 1, + 1, + load_package_state_update=load_package_state_update, ) # set state to be extracted mark_state_extracted(state, hash_) diff --git a/dlt/pipeline/warnings.py b/dlt/pipeline/warnings.py index 87fcbc1f0c..8bee670cb7 100644 --- a/dlt/pipeline/warnings.py +++ b/dlt/pipeline/warnings.py @@ -20,3 +20,16 @@ def credentials_argument_deprecated( Dlt04DeprecationWarning, stacklevel=2, ) + + +def full_refresh_argument_deprecated(caller_name: str, full_refresh: t.Optional[bool]) -> None: + """full_refresh argument is replaced with dev_mode""" + if full_refresh is None: + return + + warnings.warn( + f"The `full_refresh` argument to {caller_name} is deprecated and will be removed in a" + f" future version. 
Use `dev_mode={full_refresh}` instead which will have the same effect.", + Dlt04DeprecationWarning, + stacklevel=2, + ) diff --git a/docs/examples/archive/google_sheets.py b/docs/examples/archive/google_sheets.py index 26c3d30b54..61b9859c53 100644 --- a/docs/examples/archive/google_sheets.py +++ b/docs/examples/archive/google_sheets.py @@ -2,7 +2,7 @@ from sources.google_sheets import google_spreadsheet -dlt.pipeline(destination="bigquery", full_refresh=False) +dlt.pipeline(destination="bigquery", dev_mode=False) # see example.secrets.toml to where to put credentials # "2022-05", "model_metadata" diff --git a/docs/examples/archive/quickstart.py b/docs/examples/archive/quickstart.py index 6e49f1af7a..f435fa3fab 100644 --- a/docs/examples/archive/quickstart.py +++ b/docs/examples/archive/quickstart.py @@ -48,7 +48,7 @@ dataset_name=dataset_name, credentials=credentials, export_schema_path=export_schema_path, - full_refresh=True, + dev_mode=True, ) diff --git a/docs/examples/archive/rasa_example.py b/docs/examples/archive/rasa_example.py index e83e6c61f7..76e3e9c011 100644 --- a/docs/examples/archive/rasa_example.py +++ b/docs/examples/archive/rasa_example.py @@ -20,7 +20,7 @@ event_files = jsonl_files([file for file in os.scandir("docs/examples/data/rasa_trackers")]) info = dlt.pipeline( - full_refresh=True, + dev_mode=True, destination=postgres, # export_schema_path=... # uncomment to see the final schema in the folder you want ).run( diff --git a/docs/examples/archive/singer_tap_jsonl_example.py b/docs/examples/archive/singer_tap_jsonl_example.py index c926a9f153..109dd05b3f 100644 --- a/docs/examples/archive/singer_tap_jsonl_example.py +++ b/docs/examples/archive/singer_tap_jsonl_example.py @@ -9,7 +9,7 @@ # load hubspot schema stub - it converts all field names with `timestamp` into timestamp type schema = SchemaStorage.load_schema_file("docs/examples/schemas/", "hubspot", ("yaml",)) -p = dlt.pipeline(destination="postgres", full_refresh=True) +p = dlt.pipeline(destination="postgres", dev_mode=True) # now load a pipeline created from jsonl resource that feeds messages into singer tap transformer pipe = jsonl_file("docs/examples/data/singer_taps/tap_hubspot.jsonl") | singer_raw_stream() # provide hubspot schema diff --git a/docs/examples/chess/chess.py b/docs/examples/chess/chess.py index df1fb18845..7b577c2646 100644 --- a/docs/examples/chess/chess.py +++ b/docs/examples/chess/chess.py @@ -50,9 +50,9 @@ def players_games(username: Any) -> Iterator[TDataItems]: print("You must run this from the docs/examples/chess folder") # chess_url in config.toml, credentials for postgres in secrets.toml, credentials always under credentials key # look for parallel run configuration in `config.toml`! 
- # mind the full_refresh: it makes the pipeline to load to a distinct dataset each time it is run and always is resetting the schema and state + # mind the dev_mode: it makes the pipeline to load to a distinct dataset each time it is run and always is resetting the schema and state load_info = dlt.pipeline( - pipeline_name="chess_games", destination="postgres", dataset_name="chess", full_refresh=True + pipeline_name="chess_games", destination="postgres", dataset_name="chess", dev_mode=True ).run(chess(max_players=5, month=9)) # display where the data went print(load_info) diff --git a/docs/website/docs/build-a-pipeline-tutorial.md b/docs/website/docs/build-a-pipeline-tutorial.md index 88a64b46a0..a7cd4e4050 100644 --- a/docs/website/docs/build-a-pipeline-tutorial.md +++ b/docs/website/docs/build-a-pipeline-tutorial.md @@ -329,7 +329,7 @@ pipeline = dlt.pipeline( pipeline_name="github_pipeline", destination="duckdb", dataset_name="github_reactions", - full_refresh=True + dev_mode=True ) with pipeline.sql_client() as client: diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index c2f6786f8d..d6ec36ae49 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -90,7 +90,7 @@ p = dlt.pipeline( pipeline_name='chess', destination=dlt.destinations.duckdb("files/data.db"), dataset_name='chess_data', - full_refresh=False + dev_mode=False ) # will load data to /var/local/database.duckdb (absolute path) @@ -98,7 +98,7 @@ p = dlt.pipeline( pipeline_name='chess', destination=dlt.destinations.duckdb("/var/local/database.duckdb"), dataset_name='chess_data', - full_refresh=False + dev_mode=False ) ``` @@ -112,7 +112,7 @@ p = dlt.pipeline( pipeline_name="chess", destination=dlt.destinations.duckdb(db), dataset_name="chess_data", - full_refresh=False, + dev_mode=False, ) # Or if you would like to use in-memory duckdb instance @@ -183,4 +183,3 @@ This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-d This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). - diff --git a/docs/website/docs/dlt-ecosystem/transformations/pandas.md b/docs/website/docs/dlt-ecosystem/transformations/pandas.md index 5a82d8be66..0e08666eaf 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/pandas.md +++ b/docs/website/docs/dlt-ecosystem/transformations/pandas.md @@ -16,7 +16,7 @@ pipeline = dlt.pipeline( pipeline_name="github_pipeline", destination="duckdb", dataset_name="github_reactions", - full_refresh=True + dev_mode=True ) with pipeline.sql_client() as client: with client.execute_query( diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md index 70dcc979f3..f00e185480 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/salesforce.md @@ -262,7 +262,7 @@ To create your data pipeline using single loading and > For incremental loading of endpoints, maintain the pipeline name and destination dataset name. > The pipeline name is important for accessing the [state](../../general-usage/state) from the > last run, including the end date for incremental data loads. 
Altering these names could trigger - > a [“full_refresh”](../../general-usage/pipeline#do-experiments-with-full-refresh), disrupting + > a [“dev-mode”](../../general-usage/pipeline#do-experiments-with-dev-mode), disrupting > the metadata tracking for [incremental data loading](../../general-usage/incremental-loading). 1. To load data from the “contact” in replace mode and “task” incrementally merge mode endpoints: diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md index b8993ae8d5..cfccf5d675 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/zendesk.md @@ -330,10 +330,10 @@ verified source. ```py pipeline = dlt.pipeline( - pipeline_name="dlt_zendesk_pipeline", # Use a custom name if desired - destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) - full_refresh = False, - dataset_name="sample_zendesk_data" # Use a custom name if desired + pipeline_name="dlt_zendesk_pipeline", # Use a custom name if desired + destination="duckdb", # Choose the appropriate destination (e.g., duckdb, redshift, post) + dev_mode = False, + dataset_name="sample_zendesk_data" # Use a custom name if desired ) data = zendesk_support(load_all=True, start_date=start_date) data_chat = zendesk_chat(start_date=start_date) diff --git a/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md b/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md index ffe0abd082..d9aae62f94 100644 --- a/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md +++ b/docs/website/docs/dlt-ecosystem/visualizations/exploring-the-data.md @@ -59,7 +59,7 @@ pipeline = dlt.pipeline( pipeline_name="github_pipeline", destination="duckdb", dataset_name="github_reactions", - full_refresh=True + dev_mode=True ) with pipeline.sql_client() as client: with client.execute_query( diff --git a/docs/website/docs/general-usage/destination-tables.md b/docs/website/docs/general-usage/destination-tables.md index 4780d4be20..b53d864a96 100644 --- a/docs/website/docs/general-usage/destination-tables.md +++ b/docs/website/docs/general-usage/destination-tables.md @@ -276,12 +276,12 @@ Notice that the `mydata.users` table now contains the data from both the previou ## Versioned datasets -When you set the `full_refresh` argument to `True` in `dlt.pipeline` call, dlt creates a versioned dataset. +When you set the `dev_mode` argument to `True` in `dlt.pipeline` call, dlt creates a versioned dataset. This means that each time you run the pipeline, the data is loaded into a new dataset (a new database schema). The dataset name is the same as the `dataset_name` you provided in the pipeline definition with a datetime-based suffix. 
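As a minimal sketch of the versioned-dataset behavior described above: with `dev_mode=True` the resolved dataset name carries the suffix, which you can inspect via the pipeline's `dataset_name` attribute. The pipeline and dataset names here are illustrative, and the exact suffix format may differ:

```py
import dlt

# dev_mode makes dlt append a datetime-based suffix to the dataset name
pipeline = dlt.pipeline(
    pipeline_name="quick_start",
    destination="duckdb",
    dataset_name="mydata",
    dev_mode=True,
)

# prints something like "mydata_20240603120000" instead of plain "mydata"
print(pipeline.dataset_name)
```
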
-We modify our pipeline to use the `full_refresh` option to see how this works:
+We modify our pipeline to use the `dev_mode` option to see how this works:
 
 ```py
 import dlt
@@ -295,7 +295,7 @@ pipeline = dlt.pipeline(
     pipeline_name='quick_start',
     destination='duckdb',
     dataset_name='mydata',
-    full_refresh=True # <-- add this line
+    dev_mode=True # <-- add this line
 )
 load_info = pipeline.run(data, table_name="users")
 ```
diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md
index 18bdb13b06..c2c951c9b0 100644
--- a/docs/website/docs/general-usage/incremental-loading.md
+++ b/docs/website/docs/general-usage/incremental-loading.md
@@ -223,7 +223,7 @@ pipeline = dlt.pipeline(
     pipeline_name='facebook_insights',
     destination='duckdb',
     dataset_name='facebook_insights_data',
-    full_refresh=True
+    dev_mode=True
 )
 fb_ads = facebook_ads_source()
 # enable root key propagation on a source that is not a merge one by default.
diff --git a/docs/website/docs/general-usage/pipeline.md b/docs/website/docs/general-usage/pipeline.md
index 53eca2e59a..d1f82f970a 100644
--- a/docs/website/docs/general-usage/pipeline.md
+++ b/docs/website/docs/general-usage/pipeline.md
@@ -1,7 +1,7 @@
 ---
 title: Pipeline
 description: Explanation of what a dlt pipeline is
-keywords: [pipeline, source, full refresh]
+keywords: [pipeline, source, full refresh, dev mode]
 ---
 
 # Pipeline
@@ -85,13 +85,47 @@ You can inspect stored artifacts using the command
 
 > 💡 You can attach `Pipeline` instance to an existing working folder, without creating a new
 > pipeline with `dlt.attach`.
 
-## Do experiments with full refresh
+## Do experiments with dev mode
 
 If you [create a new pipeline script](../walkthroughs/create-a-pipeline.md) you will be
 experimenting a lot. If you want that each time the pipeline resets its state and loads data to a
-new dataset, set the `full_refresh` argument of the `dlt.pipeline` method to True. Each time the
+new dataset, set the `dev_mode` argument of the `dlt.pipeline` method to True. Each time the
 pipeline is created, `dlt` adds datetime-based suffix to the dataset name.
 
+## Refresh pipeline data and state
+
+You can reset parts or all of your sources by using the `refresh` argument to `dlt.pipeline` or the pipeline's `run` or `extract` method.
+That means when you run the pipeline, the sources/resources being processed will have their state reset and their tables either dropped or truncated
+depending on which refresh mode is used.
+
+The `refresh` argument should have one of the following string values to decide the refresh mode:
+
+* `drop_sources`
+ All sources being processed in `pipeline.run` or `pipeline.extract` are refreshed.
+ That means all tables listed in their schemas are dropped and state belonging to those sources and all their resources is completely wiped.
+ The tables are deleted both from the pipeline's schema and from the destination database.
+
+ If you only have one source or run with all your sources together, then this is practically like running the pipeline again for the first time.
+
+ :::caution
+ This erases schema history for the selected sources and only the latest version is stored.
+ :::
+
+* `drop_resources`
+ Limits the refresh to the resources being processed in `pipeline.run` or `pipeline.extract` (e.g. by using `source.with_resources(...)`).
+ Tables belonging to those resources are dropped and their resource state is wiped (that includes incremental state).
+ The tables are deleted both from the pipeline's schema and from the destination database.
+
+ Source level state keys are not deleted in this mode (i.e. `dlt.state()['my_key'] = ''`).
+
+ :::caution
+ This erases schema history for all affected schemas and only the latest schema version is stored.
+ :::
+
+* `drop_data`
+ Same as `drop_resources`, but instead of dropping tables from the schema, only the data is deleted from them (i.e. by `TRUNCATE` in sql destinations). Resource state for selected resources is also wiped.
+ The schema remains unmodified in this case.
+
 ## Display the loading progress
 
 You can add a progress monitor to the pipeline. Typically, its role is to visually assure user that
diff --git a/docs/website/docs/general-usage/state.md b/docs/website/docs/general-usage/state.md
index 0ab2b8a658..4a9e453ea4 100644
--- a/docs/website/docs/general-usage/state.md
+++ b/docs/website/docs/general-usage/state.md
@@ -125,7 +125,7 @@ will display source and resource state slots for all known sources.
 **To fully reset the state:**
 
 - Drop the destination dataset to fully reset the pipeline.
-- [Set the `full_refresh` flag when creating pipeline](pipeline.md#do-experiments-with-full-refresh).
+- [Set the `dev_mode` flag when creating pipeline](pipeline.md#do-experiments-with-dev-mode).
 - Use the `dlt pipeline drop --drop-all` command to
   [drop state and tables for a given schema name](../reference/command-line-interface.md#selectively-drop-tables-and-reset-state).
diff --git a/docs/website/docs/reference/performance_snippets/performance-snippets.py b/docs/website/docs/reference/performance_snippets/performance-snippets.py
index 68ec8ed72d..7fc0f2bce9 100644
--- a/docs/website/docs/reference/performance_snippets/performance-snippets.py
+++ b/docs/website/docs/reference/performance_snippets/performance-snippets.py
@@ -20,7 +20,7 @@ def read_table(limit):
 
     # this prevents process pool to run the initialization code again
     if __name__ == "__main__" or "PYTEST_CURRENT_TEST" in os.environ:
-        pipeline = dlt.pipeline("parallel_load", destination="duckdb", full_refresh=True)
+        pipeline = dlt.pipeline("parallel_load", destination="duckdb", dev_mode=True)
         pipeline.extract(read_table(1000000))
 
         load_id = pipeline.list_extracted_load_packages()[0]
@@ -168,8 +168,8 @@ def _run_pipeline(pipeline, gen_):
         return pipeline.run(gen_())
 
     # declare pipelines in main thread then run them "async"
-    pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", full_refresh=True)
-    pipeline_2 = dlt.pipeline("pipeline_2", destination="duckdb", full_refresh=True)
+    pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", dev_mode=True)
+    pipeline_2 = dlt.pipeline("pipeline_2", destination="duckdb", dev_mode=True)
 
     async def _run_async():
         loop = asyncio.get_running_loop()
diff --git a/docs/website/docs/walkthroughs/adjust-a-schema.md b/docs/website/docs/walkthroughs/adjust-a-schema.md
index b0a9a9ce05..b92f431f80 100644
--- a/docs/website/docs/walkthroughs/adjust-a-schema.md
+++ b/docs/website/docs/walkthroughs/adjust-a-schema.md
@@ -71,13 +71,13 @@ You should keep the import schema as simple as possible and let `dlt` do the res
 automatically on the next run. It means that after a user update, the schema in `import` folder
 reverts all the automatic updates from the data.
 
-In next steps we'll experiment a lot, you will be warned to set `full_refresh=True` until we are done experimenting.
+In next steps we'll experiment a lot, you will be warned to set `dev_mode=True` until we are done experimenting.
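Supplementing the `refresh` modes added to `pipeline.md` above, here is a minimal sketch of how the new `refresh` argument might be passed to `pipeline.run`; the resource, pipeline, and dataset names are hypothetical:

```py
import dlt


@dlt.resource(name="users", write_disposition="append")
def users():
    yield [{"id": 1, "name": "alice"}, {"id": 2, "name": "bob"}]


pipeline = dlt.pipeline(
    pipeline_name="refresh_demo",
    destination="duckdb",
    dataset_name="refresh_demo_data",
)

# first run creates the table and the resource state
pipeline.run(users())

# drop the "users" table and its resource state before loading again
pipeline.run(users(), refresh="drop_resources")

# or: keep the schema, wipe only the data and the resource state
pipeline.run(users(), refresh="drop_data")
```
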
:::caution `dlt` will **not modify** tables after they are created. So if you have a `yaml` file, and you change it (e.g. change a data type or add a hint), then you need to **delete the dataset** -or set `full_refresh=True`: +or set `dev_mode=True`: ```py dlt.pipeline( import_schema_path="schemas/import", @@ -85,7 +85,7 @@ dlt.pipeline( pipeline_name="chess_pipeline", destination='duckdb', dataset_name="games_data", - full_refresh=True, + dev_mode=True, ) ``` ::: diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md index 329f484874..ce76240c8a 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md @@ -151,7 +151,7 @@ def load_data(): pipeline_name='pipeline_name', dataset_name='dataset_name', destination='duckdb', - full_refresh=False # must be false if we decompose + dev_mode=False # must be false if we decompose ) # Create the source, the "serialize" decompose option # will convert dlt resources into Airflow tasks. diff --git a/tests/cli/cases/deploy_pipeline/debug_pipeline.py b/tests/cli/cases/deploy_pipeline/debug_pipeline.py index c49e8b524d..1f5bfad976 100644 --- a/tests/cli/cases/deploy_pipeline/debug_pipeline.py +++ b/tests/cli/cases/deploy_pipeline/debug_pipeline.py @@ -17,7 +17,7 @@ def example_source(api_url=dlt.config.value, api_key=dlt.secrets.value, last_id= pipeline_name="debug_pipeline", destination="postgres", dataset_name="debug_pipeline_data", - full_refresh=False, + dev_mode=False, ) load_info = p.run(example_source(last_id=819273998)) print(load_info) diff --git a/tests/common/configuration/test_configuration.py b/tests/common/configuration/test_configuration.py index 43ccdf856c..945856e93f 100644 --- a/tests/common/configuration/test_configuration.py +++ b/tests/common/configuration/test_configuration.py @@ -53,6 +53,7 @@ add_config_dict_to_env, add_config_to_env, ) +from dlt.common.pipeline import TRefreshMode from tests.utils import preserve_environ from tests.common.configuration.utils import ( @@ -240,6 +241,11 @@ def resolve_dynamic_type_field(self) -> Type[Union[int, str]]: return str +@configspec +class ConfigWithLiteralField(BaseConfiguration): + refresh: TRefreshMode = None + + LongInteger = NewType("LongInteger", int) FirstOrderStr = NewType("FirstOrderStr", str) SecondOrderStr = NewType("SecondOrderStr", FirstOrderStr) @@ -1310,3 +1316,20 @@ class EmbeddedConfigurationWithDefaults(BaseConfiguration): c_resolved = resolve.resolve_configuration(c_instance) assert c_resolved.is_resolved() assert c_resolved.conn_str.is_resolved() + + +def test_configuration_with_literal_field(environment: Dict[str, str]) -> None: + """Literal type fields only allow values from the literal""" + environment["REFRESH"] = "not_a_refresh_mode" + + with pytest.raises(ConfigValueCannotBeCoercedException) as einfo: + resolve.resolve_configuration(ConfigWithLiteralField()) + + assert einfo.value.field_name == "refresh" + assert einfo.value.field_value == "not_a_refresh_mode" + assert einfo.value.hint == TRefreshMode + + environment["REFRESH"] = "drop_data" + + spec = resolve.resolve_configuration(ConfigWithLiteralField()) + assert spec.refresh == "drop_data" diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index 308b65bd37..125c699c90 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ 
-296,7 +296,7 @@ def some_data(param: str): # create two resource instances and extract in single ad hoc resource data1 = some_data("state1") data1._pipe.name = "state1_data" - dlt.pipeline(full_refresh=True).extract([data1, some_data("state2")], schema=Schema("default")) + dlt.pipeline(dev_mode=True).extract([data1, some_data("state2")], schema=Schema("default")) # both should be extracted. what we test here is the combination of binding the resource by calling it that clones the internal pipe # and then creating a source with both clones. if we keep same pipe id when cloning on call, a single pipe would be created shared by two resources assert all_yields == ["state1", "state2"] @@ -738,7 +738,7 @@ def test_source(no_resources): def test_source_resource_attrs_with_conflicting_attrs() -> None: """Resource names that conflict with DltSource attributes do not work with attribute access""" - dlt.pipeline(full_refresh=True) # Create pipeline so state property can be accessed + dlt.pipeline(dev_mode=True) # Create pipeline so state property can be accessed names = ["state", "resources", "schema", "name", "clone"] @dlt.source @@ -842,7 +842,7 @@ def test_source(expected_state): with pytest.raises(PipelineStateNotAvailable): test_source({}).state - dlt.pipeline(full_refresh=True) + dlt.pipeline(dev_mode=True) assert test_source({}).state == {} # inject state to see if what we write in state is there @@ -872,7 +872,7 @@ def test_source(): with pytest.raises(PipelineStateNotAvailable): s.test_resource.state - p = dlt.pipeline(full_refresh=True) + p = dlt.pipeline(dev_mode=True) assert r.state == {} assert s.state == {} assert s.test_resource.state == {} diff --git a/tests/helpers/dbt_tests/local/test_runner_destinations.py b/tests/helpers/dbt_tests/local/test_runner_destinations.py index c9e4b7c83b..244f06e9ce 100644 --- a/tests/helpers/dbt_tests/local/test_runner_destinations.py +++ b/tests/helpers/dbt_tests/local/test_runner_destinations.py @@ -99,7 +99,7 @@ def test_dbt_test_no_raw_schema(destination_info: DBTDestinationInfo) -> None: assert isinstance(prq_ex.value.args[0], DBTProcessingError) -def test_dbt_run_full_refresh(destination_info: DBTDestinationInfo) -> None: +def test_dbt_run_dev_mode(destination_info: DBTDestinationInfo) -> None: if destination_info.destination_name == "redshift": pytest.skip("redshift disabled due to missing fixtures") runner = setup_rasa_runner(destination_info.destination_name) diff --git a/tests/load/athena_iceberg/test_athena_iceberg.py b/tests/load/athena_iceberg/test_athena_iceberg.py index d3bb9eb5f5..4fe01752ee 100644 --- a/tests/load/athena_iceberg/test_athena_iceberg.py +++ b/tests/load/athena_iceberg/test_athena_iceberg.py @@ -28,7 +28,7 @@ def test_iceberg() -> None: pipeline_name="athena-iceberg", destination="athena", staging="filesystem", - full_refresh=True, + dev_mode=True, ) def items() -> Iterator[Any]: diff --git a/tests/load/bigquery/test_bigquery_table_builder.py b/tests/load/bigquery/test_bigquery_table_builder.py index 2db90200ec..df564192dc 100644 --- a/tests/load/bigquery/test_bigquery_table_builder.py +++ b/tests/load/bigquery/test_bigquery_table_builder.py @@ -200,7 +200,7 @@ def test_create_table_with_integer_partition(gcp_client: BigQueryClient) -> None def test_bigquery_partition_by_date( destination_config: DestinationTestConfiguration, ) -> None: - pipeline = destination_config.setup_pipeline(f"bigquery_{uniq_id()}", full_refresh=True) + pipeline = destination_config.setup_pipeline(f"bigquery_{uniq_id()}", dev_mode=True) 
@dlt.resource( write_disposition="merge", @@ -243,7 +243,7 @@ def demo_source() -> DltResource: def test_bigquery_no_partition_by_date( destination_config: DestinationTestConfiguration, ) -> None: - pipeline = destination_config.setup_pipeline(f"bigquery_{uniq_id()}", full_refresh=True) + pipeline = destination_config.setup_pipeline(f"bigquery_{uniq_id()}", dev_mode=True) @dlt.resource( write_disposition="merge", @@ -286,7 +286,7 @@ def demo_source() -> DltResource: def test_bigquery_partition_by_timestamp( destination_config: DestinationTestConfiguration, ) -> None: - pipeline = destination_config.setup_pipeline(f"bigquery_{uniq_id()}", full_refresh=True) + pipeline = destination_config.setup_pipeline(f"bigquery_{uniq_id()}", dev_mode=True) @dlt.resource( write_disposition="merge", @@ -329,7 +329,7 @@ def demo_source() -> DltResource: def test_bigquery_no_partition_by_timestamp( destination_config: DestinationTestConfiguration, ) -> None: - pipeline = destination_config.setup_pipeline(f"bigquery_{uniq_id()}", full_refresh=True) + pipeline = destination_config.setup_pipeline(f"bigquery_{uniq_id()}", dev_mode=True) @dlt.resource( write_disposition="merge", @@ -372,7 +372,7 @@ def demo_source() -> DltResource: def test_bigquery_partition_by_integer( destination_config: DestinationTestConfiguration, ) -> None: - pipeline = destination_config.setup_pipeline(f"bigquery_{uniq_id()}", full_refresh=True) + pipeline = destination_config.setup_pipeline(f"bigquery_{uniq_id()}", dev_mode=True) @dlt.resource( columns={"some_int": {"data_type": "bigint", "partition": True, "nullable": False}}, @@ -407,7 +407,7 @@ def demo_source() -> DltResource: def test_bigquery_no_partition_by_integer( destination_config: DestinationTestConfiguration, ) -> None: - pipeline = destination_config.setup_pipeline(f"bigquery_{uniq_id()}", full_refresh=True) + pipeline = destination_config.setup_pipeline(f"bigquery_{uniq_id()}", dev_mode=True) @dlt.resource( columns={"some_int": {"data_type": "bigint", "partition": False, "nullable": False}}, @@ -510,7 +510,7 @@ def sources() -> List[DltResource]: pipeline = destination_config.setup_pipeline( f"bigquery_{uniq_id()}", - full_refresh=True, + dev_mode=True, ) pipeline.run(sources()) @@ -570,7 +570,7 @@ def sources() -> List[DltResource]: pipeline = destination_config.setup_pipeline( f"bigquery_{uniq_id()}", - full_refresh=True, + dev_mode=True, ) pipeline.run(sources()) @@ -632,7 +632,7 @@ def sources() -> List[DltResource]: pipeline = destination_config.setup_pipeline( f"bigquery_{uniq_id()}", - full_refresh=True, + dev_mode=True, ) pipeline.run(sources()) @@ -764,7 +764,7 @@ def sources() -> List[DltResource]: pipeline = destination_config.setup_pipeline( f"bigquery_{uniq_id()}", - full_refresh=True, + dev_mode=True, ) pipeline.run(sources()) @@ -814,7 +814,7 @@ def sources() -> List[DltResource]: pipeline = destination_config.setup_pipeline( f"bigquery_{uniq_id()}", - full_refresh=True, + dev_mode=True, ) pipeline.run(sources()) @@ -904,7 +904,7 @@ def sources() -> List[DltResource]: pipeline = destination_config.setup_pipeline( f"bigquery_{uniq_id()}", - full_refresh=True, + dev_mode=True, ) pipeline.run(sources()) @@ -1004,7 +1004,7 @@ def sources() -> List[DltResource]: pipeline = destination_config.setup_pipeline( f"bigquery_{uniq_id()}", - full_refresh=True, + dev_mode=True, ) pipeline.run(sources()) @@ -1049,7 +1049,7 @@ def hints() -> Iterator[Dict[str, Any]]: pipeline = destination_config.setup_pipeline( f"bigquery_{uniq_id()}", - full_refresh=True, + dev_mode=True, 
) pipeline.run(hints) diff --git a/tests/load/pipeline/test_athena.py b/tests/load/pipeline/test_athena.py index a5bb6efc0d..272cc701d5 100644 --- a/tests/load/pipeline/test_athena.py +++ b/tests/load/pipeline/test_athena.py @@ -30,7 +30,7 @@ ids=lambda x: x.name, ) def test_athena_destinations(destination_config: DestinationTestConfiguration) -> None: - pipeline = destination_config.setup_pipeline("athena_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("athena_" + uniq_id(), dev_mode=True) @dlt.resource(name="items", write_disposition="append") def items(): @@ -88,7 +88,7 @@ def items2(): def test_athena_all_datatypes_and_timestamps( destination_config: DestinationTestConfiguration, ) -> None: - pipeline = destination_config.setup_pipeline("athena_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("athena_" + uniq_id(), dev_mode=True) # TIME is not supported column_schemas, data_types = table_update_and_row(exclude_types=["time"]) @@ -176,7 +176,7 @@ def my_source() -> Any: ids=lambda x: x.name, ) def test_athena_blocks_time_column(destination_config: DestinationTestConfiguration) -> None: - pipeline = destination_config.setup_pipeline("athena_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("athena_" + uniq_id(), dev_mode=True) column_schemas, data_types = table_update_and_row() diff --git a/tests/load/pipeline/test_dbt_helper.py b/tests/load/pipeline/test_dbt_helper.py index 38e66c4ab9..6793414e3c 100644 --- a/tests/load/pipeline/test_dbt_helper.py +++ b/tests/load/pipeline/test_dbt_helper.py @@ -39,7 +39,7 @@ def test_run_jaffle_package( pytest.skip( "dbt-athena requires database to be created and we don't do it in case of Jaffle" ) - pipeline = destination_config.setup_pipeline("jaffle_jaffle", full_refresh=True) + pipeline = destination_config.setup_pipeline("jaffle_jaffle", dev_mode=True) # get runner, pass the env from fixture dbt = dlt.dbt.package(pipeline, "https://github.com/dbt-labs/jaffle_shop.git", venv=dbt_venv) # no default schema @@ -76,7 +76,7 @@ def test_run_chess_dbt(destination_config: DestinationTestConfiguration, dbt_ven os.environ["CHESS_URL"] = "https://api.chess.com/pub/" pipeline = destination_config.setup_pipeline( - "chess_games", dataset_name="chess_dbt_test", full_refresh=True + "chess_games", dataset_name="chess_dbt_test", dev_mode=True ) assert pipeline.default_schema_name is None # get the runner for the "dbt_transform" package @@ -129,7 +129,7 @@ def test_run_chess_dbt_to_other_dataset( os.environ["CHESS_URL"] = "https://api.chess.com/pub/" pipeline = destination_config.setup_pipeline( - "chess_games", dataset_name="chess_dbt_test", full_refresh=True + "chess_games", dataset_name="chess_dbt_test", dev_mode=True ) # load each schema in separate dataset pipeline.config.use_single_dataset = False diff --git a/tests/load/pipeline/test_drop.py b/tests/load/pipeline/test_drop.py index afae1c22ca..313ba63a2c 100644 --- a/tests/load/pipeline/test_drop.py +++ b/tests/load/pipeline/test_drop.py @@ -130,7 +130,7 @@ def test_drop_command_resources_and_state(destination_config: DestinationTestCon """Test the drop command with resource and state path options and verify correct data is deleted from destination and locally""" source = droppable_source() - pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), dev_mode=True) pipeline.run(source) attached = 
_attach(pipeline) @@ -155,7 +155,7 @@ def test_drop_command_resources_and_state(destination_config: DestinationTestCon def test_drop_command_only_state(destination_config: DestinationTestConfiguration) -> None: """Test drop command that deletes part of the state and syncs with destination""" source = droppable_source() - pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), dev_mode=True) pipeline.run(source) attached = _attach(pipeline) @@ -198,22 +198,24 @@ def test_drop_command_only_tables(destination_config: DestinationTestConfigurati "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name ) def test_drop_destination_tables_fails(destination_config: DestinationTestConfiguration) -> None: - """Fail on drop tables. Command runs again.""" + """Fail on DROP TABLES in destination init. Command runs again.""" source = droppable_source() - pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), dev_mode=True) pipeline.run(source) attached = _attach(pipeline) with mock.patch.object( - helpers.DropCommand, - "_drop_destination_tables", - side_effect=RuntimeError("Something went wrong"), + pipeline.destination.client_class, + "drop_tables", + autospec=True, + side_effect=RuntimeError("Oh no!"), ): - with pytest.raises(RuntimeError): + with pytest.raises(PipelineStepFailed) as einfo: helpers.drop(attached, resources=("droppable_a", "droppable_b")) + assert isinstance(einfo.value.exception, RuntimeError) + assert "Oh no!" in str(einfo.value.exception) - attached = _attach(pipeline) helpers.drop(attached, resources=("droppable_a", "droppable_b")) assert_dropped_resources(attached, ["droppable_a", "droppable_b"]) @@ -226,17 +228,24 @@ def test_drop_destination_tables_fails(destination_config: DestinationTestConfig def test_fail_after_drop_tables(destination_config: DestinationTestConfiguration) -> None: """Fail directly after drop tables. Command runs again ignoring destination tables missing.""" source = droppable_source() - pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), dev_mode=True) pipeline.run(source) attached = _attach(pipeline) + # Fail on client update_stored_schema with mock.patch.object( - helpers.DropCommand, "_extract_state", side_effect=RuntimeError("Something went wrong") + pipeline.destination.client_class, + "update_stored_schema", + autospec=True, + side_effect=RuntimeError("Oh no!"), ): - with pytest.raises(RuntimeError): + with pytest.raises(PipelineStepFailed) as einfo: helpers.drop(attached, resources=("droppable_a", "droppable_b")) + assert isinstance(einfo.value.exception, RuntimeError) + assert "Oh no!" in str(einfo.value.exception) + attached = _attach(pipeline) helpers.drop(attached, resources=("droppable_a", "droppable_b")) @@ -250,7 +259,7 @@ def test_fail_after_drop_tables(destination_config: DestinationTestConfiguration def test_load_step_fails(destination_config: DestinationTestConfiguration) -> None: """Test idempotence. pipeline.load() fails. 
Command can be run again successfully""" source = droppable_source() - pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), dev_mode=True) pipeline.run(source) attached = _attach(pipeline) @@ -272,7 +281,7 @@ def test_load_step_fails(destination_config: DestinationTestConfiguration) -> No ) def test_resource_regex(destination_config: DestinationTestConfiguration) -> None: source = droppable_source() - pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), dev_mode=True) pipeline.run(source) attached = _attach(pipeline) @@ -291,7 +300,7 @@ def test_resource_regex(destination_config: DestinationTestConfiguration) -> Non def test_drop_nothing(destination_config: DestinationTestConfiguration) -> None: """No resources, no state keys. Nothing is changed.""" source = droppable_source() - pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), dev_mode=True) pipeline.run(source) attached = _attach(pipeline) @@ -309,7 +318,7 @@ def test_drop_nothing(destination_config: DestinationTestConfiguration) -> None: def test_drop_all_flag(destination_config: DestinationTestConfiguration) -> None: """Using drop_all flag. Destination dataset and all local state is deleted""" source = droppable_source() - pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), dev_mode=True) pipeline.run(source) dlt_tables = [ t["name"] for t in pipeline.default_schema.dlt_tables() @@ -335,7 +344,7 @@ def test_drop_all_flag(destination_config: DestinationTestConfiguration) -> None ) def test_run_pipeline_after_partial_drop(destination_config: DestinationTestConfiguration) -> None: """Pipeline can be run again after dropping some resources""" - pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), dev_mode=True) pipeline.run(droppable_source()) attached = _attach(pipeline) @@ -354,7 +363,7 @@ def test_run_pipeline_after_partial_drop(destination_config: DestinationTestConf ) def test_drop_state_only(destination_config: DestinationTestConfiguration) -> None: """Pipeline can be run again after dropping some resources""" - pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), dev_mode=True) pipeline.run(droppable_source()) attached = _attach(pipeline) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 5f24daf57f..623284d8a7 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -362,8 +362,9 @@ def _collect_table_counts(p) -> Dict[str, int]: ) # generate 4 loads from 2 pipelines, store load ids - p1 = destination_config.setup_pipeline("p1", dataset_name="layout_test") - p2 = destination_config.setup_pipeline("p2", dataset_name="layout_test") + dataset_name = "layout_test_" + uniq_id() + p1 = destination_config.setup_pipeline("p1", dataset_name=dataset_name) + p2 = destination_config.setup_pipeline("p2", dataset_name=dataset_name) c1 = cast(FilesystemClient, 
p1.destination_client()) c2 = cast(FilesystemClient, p2.destination_client()) diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py index f4e039ee81..a3f5083ae6 100644 --- a/tests/load/pipeline/test_merge_disposition.py +++ b/tests/load/pipeline/test_merge_disposition.py @@ -31,7 +31,7 @@ "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name ) def test_merge_on_keys_in_schema(destination_config: DestinationTestConfiguration) -> None: - p = destination_config.setup_pipeline("eth_2", full_refresh=True) + p = destination_config.setup_pipeline("eth_2", dev_mode=True) with open("tests/common/cases/schemas/eth/ethereum_schema_v5.yml", "r", encoding="utf-8") as f: schema = dlt.Schema.from_dict(yaml.safe_load(f)) @@ -97,7 +97,7 @@ def test_merge_on_keys_in_schema(destination_config: DestinationTestConfiguratio "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name ) def test_merge_on_ad_hoc_primary_key(destination_config: DestinationTestConfiguration) -> None: - p = destination_config.setup_pipeline("github_1", full_refresh=True) + p = destination_config.setup_pipeline("github_1", dev_mode=True) with open( "tests/normalize/cases/github.issues.load_page_5_duck.json", "r", encoding="utf-8" @@ -162,7 +162,7 @@ def load_issues(): def test_merge_source_compound_keys_and_changes( destination_config: DestinationTestConfiguration, ) -> None: - p = destination_config.setup_pipeline("github_3", full_refresh=True) + p = destination_config.setup_pipeline("github_3", dev_mode=True) info = p.run(github(), loader_file_format=destination_config.file_format) assert_load_info(info) @@ -211,7 +211,7 @@ def test_merge_source_compound_keys_and_changes( "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name ) def test_merge_no_child_tables(destination_config: DestinationTestConfiguration) -> None: - p = destination_config.setup_pipeline("github_3", full_refresh=True) + p = destination_config.setup_pipeline("github_3", dev_mode=True) github_data = github() assert github_data.max_table_nesting is None assert github_data.root_key is True @@ -251,7 +251,7 @@ def test_merge_no_merge_keys(destination_config: DestinationTestConfiguration) - # NOTE: we can test filesystem destination merge behavior here too, will also fallback! 
if destination_config.file_format == "insert_values": pytest.skip("Insert values row count checking is buggy, skipping") - p = destination_config.setup_pipeline("github_3", full_refresh=True) + p = destination_config.setup_pipeline("github_3", dev_mode=True) github_data = github() # remove all keys github_data.load_issues.apply_hints(merge_key=(), primary_key=()) @@ -279,7 +279,7 @@ def test_merge_no_merge_keys(destination_config: DestinationTestConfiguration) - "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name ) def test_merge_keys_non_existing_columns(destination_config: DestinationTestConfiguration) -> None: - p = destination_config.setup_pipeline("github_3", full_refresh=True) + p = destination_config.setup_pipeline("github_3", dev_mode=True) github_data = github() # set keys names that do not exist in the data github_data.load_issues.apply_hints(merge_key=("mA1", "Ma2"), primary_key=("123-x",)) @@ -318,7 +318,7 @@ def test_merge_keys_non_existing_columns(destination_config: DestinationTestConf ids=lambda x: x.name, ) def test_pipeline_load_parquet(destination_config: DestinationTestConfiguration) -> None: - p = destination_config.setup_pipeline("github_3", full_refresh=True) + p = destination_config.setup_pipeline("github_3", dev_mode=True) github_data = github() # generate some complex types github_data.max_table_nesting = 2 @@ -447,7 +447,7 @@ def _updated_event(node_id): ] # load to destination - p = destination_config.setup_pipeline("github_3", full_refresh=True) + p = destination_config.setup_pipeline("github_3", dev_mode=True) info = p.run( _get_shuffled_events(True) | github_resource, loader_file_format=destination_config.file_format, @@ -507,7 +507,7 @@ def _updated_event(node_id): "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name ) def test_deduplicate_single_load(destination_config: DestinationTestConfiguration) -> None: - p = destination_config.setup_pipeline("abstract", full_refresh=True) + p = destination_config.setup_pipeline("abstract", dev_mode=True) @dlt.resource(write_disposition="merge", primary_key="id") def duplicates(): @@ -538,7 +538,7 @@ def duplicates_no_child(): "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name ) def test_no_deduplicate_only_merge_key(destination_config: DestinationTestConfiguration) -> None: - p = destination_config.setup_pipeline("abstract", full_refresh=True) + p = destination_config.setup_pipeline("abstract", dev_mode=True) @dlt.resource(write_disposition="merge", merge_key="id") def duplicates(): @@ -575,7 +575,7 @@ def test_complex_column_missing(destination_config: DestinationTestConfiguration def r(data): yield data - p = destination_config.setup_pipeline("abstract", full_refresh=True) + p = destination_config.setup_pipeline("abstract", dev_mode=True) data = [{"id": 1, "simple": "foo", "complex": [1, 2, 3]}] info = p.run(r(data), loader_file_format=destination_config.file_format) @@ -618,7 +618,7 @@ def data_resource(data): # we test what happens if there are no merge keys pass - p = destination_config.setup_pipeline(f"abstract_{key_type}", full_refresh=True) + p = destination_config.setup_pipeline(f"abstract_{key_type}", dev_mode=True) # insert two records data = [ @@ -766,7 +766,7 @@ def test_hard_delete_hint_config(destination_config: DestinationTestConfiguratio def data_resource(data): yield data - p = destination_config.setup_pipeline("abstract", full_refresh=True) + p = 
destination_config.setup_pipeline("abstract", dev_mode=True) # insert two records data = [ @@ -828,7 +828,7 @@ def test_dedup_sort_hint(destination_config: DestinationTestConfiguration) -> No def data_resource(data): yield data - p = destination_config.setup_pipeline("abstract", full_refresh=True) + p = destination_config.setup_pipeline("abstract", dev_mode=True) # three records with same primary key data = [ diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index d98f335d16..ad44cd6f5c 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -217,7 +217,7 @@ def _data(): for d in data: yield d - p = destination_config.setup_pipeline("test_skip_sync_schema_for_tables", full_refresh=True) + p = destination_config.setup_pipeline("test_skip_sync_schema_for_tables", dev_mode=True) p.extract(_data) schema = p.default_schema assert "data_table" in schema.tables @@ -240,7 +240,7 @@ def _data(): destinations_configs(default_sql_configs=True, all_buckets_filesystem_configs=True), ids=lambda x: x.name, ) -def test_run_full_refresh(destination_config: DestinationTestConfiguration) -> None: +def test_run_dev_mode(destination_config: DestinationTestConfiguration) -> None: data = ["a", ["a", "b", "c"], ["a", "b", "c"]] destination_config.setup() @@ -251,7 +251,7 @@ def d(): def _data(): return dlt.resource(d(), name="lists", write_disposition="replace") - p = dlt.pipeline(full_refresh=True) + p = dlt.pipeline(dev_mode=True) info = p.run( _data(), destination=destination_config.destination, @@ -267,7 +267,7 @@ def _data(): # restore the pipeline p = dlt.attach() # restored pipeline should be never put in full refresh - assert p.full_refresh is False + assert p.dev_mode is False # assert parent table (easy), None First (db order) assert_table(p, "lists", [None, None, "a"], info=info) # child tables contain nested lists @@ -456,7 +456,7 @@ def test_dataset_name_change(destination_config: DestinationTestConfiguration) - ds_2_name = "IteRation" + uniq_id() # illegal name that will be later normalized ds_3_name = "1it/era 👍 tion__" + uniq_id() - p, s = simple_nested_pipeline(destination_config, dataset_name=ds_1_name, full_refresh=False) + p, s = simple_nested_pipeline(destination_config, dataset_name=ds_1_name, dev_mode=False) try: info = p.run(s(), loader_file_format=destination_config.file_format) assert_load_info(info) @@ -589,7 +589,7 @@ def conflict(): # conflict deselected assert "conflict" not in discover_2.tables - p = dlt.pipeline(pipeline_name="multi", destination="duckdb", full_refresh=True) + p = dlt.pipeline(pipeline_name="multi", destination="duckdb", dev_mode=True) p.extract([source_1(), source_2()]) default_schema = p.default_schema gen1_table = default_schema.tables["gen1"] @@ -614,7 +614,7 @@ def conflict(): drop_active_pipeline_data() # same pipeline but enable conflict - p = dlt.pipeline(pipeline_name="multi", destination="duckdb", full_refresh=True) + p = dlt.pipeline(pipeline_name="multi", destination="duckdb", dev_mode=True) with pytest.raises(PipelineStepFailed) as py_ex: p.extract([source_1(), source_2().with_resources("conflict")]) assert isinstance(py_ex.value.__context__, CannotCoerceColumnException) @@ -902,7 +902,7 @@ def test_pipeline_upfront_tables_two_loads( pipeline = destination_config.setup_pipeline( "test_pipeline_upfront_tables_two_loads", dataset_name="test_pipeline_upfront_tables_two_loads", - full_refresh=True, + dev_mode=True, ) @dlt.source @@ -1052,7 +1052,7 @@ def 
table_3(make_data=False): # pipeline = destination_config.setup_pipeline( # "test_load_non_utc_timestamps", # dataset_name="test_load_non_utc_timestamps", -# full_refresh=True, +# dev_mode=True, # ) # info = pipeline.run(some_data()) # # print(pipeline.default_schema.to_pretty_yaml()) @@ -1062,7 +1062,7 @@ def table_3(make_data=False): def simple_nested_pipeline( - destination_config: DestinationTestConfiguration, dataset_name: str, full_refresh: bool + destination_config: DestinationTestConfiguration, dataset_name: str, dev_mode: bool ) -> Tuple[dlt.Pipeline, Callable[[], DltSource]]: data = ["a", ["a", "b", "c"], ["a", "b", "c"]] @@ -1075,7 +1075,7 @@ def _data(): p = dlt.pipeline( pipeline_name=f"pipeline_{dataset_name}", - full_refresh=full_refresh, + dev_mode=dev_mode, destination=destination_config.destination, staging=destination_config.staging, dataset_name=dataset_name, diff --git a/tests/load/pipeline/test_redshift.py b/tests/load/pipeline/test_redshift.py index 7e786f6845..29293693f5 100644 --- a/tests/load/pipeline/test_redshift.py +++ b/tests/load/pipeline/test_redshift.py @@ -18,7 +18,7 @@ ids=lambda x: x.name, ) def test_redshift_blocks_time_column(destination_config: DestinationTestConfiguration) -> None: - pipeline = destination_config.setup_pipeline("redshift_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("redshift_" + uniq_id(), dev_mode=True) column_schemas, data_types = table_update_and_row() diff --git a/tests/load/pipeline/test_refresh_modes.py b/tests/load/pipeline/test_refresh_modes.py new file mode 100644 index 0000000000..02ed560068 --- /dev/null +++ b/tests/load/pipeline/test_refresh_modes.py @@ -0,0 +1,439 @@ +from typing import Any, List + +import pytest +import dlt +from dlt.common.pipeline import resource_state +from dlt.destinations.sql_client import DBApiCursor +from dlt.pipeline.state_sync import load_pipeline_state_from_destination +from dlt.common.typing import DictStrAny +from dlt.common.pipeline import pipeline_state as current_pipeline_state + +from tests.utils import clean_test_storage, preserve_environ +from tests.pipeline.utils import ( + assert_load_info, + load_tables_to_dicts, + assert_only_table_columns, + table_exists, +) +from tests.load.utils import destinations_configs, DestinationTestConfiguration + + +def assert_source_state_is_wiped(state: DictStrAny) -> None: + # Keys contains only "resources" or is empty + assert list(state.keys()) == ["resources"] or not state + for value in state["resources"].values(): + assert not value + + +def column_values(cursor: DBApiCursor, column_name: str) -> List[Any]: + """Return all values in a column from a cursor""" + idx = [c[0] for c in cursor.native_cursor.description].index(column_name) + return [row[idx] for row in cursor.fetchall()] + + +@dlt.source +def refresh_source(first_run: bool = True, drop_sources: bool = False): + @dlt.resource + def some_data_1(): + if first_run: + # Set some source and resource state + dlt.state()["source_key_1"] = "source_value_1" + resource_state("some_data_1")["run1_1"] = "value1_1" + resource_state("some_data_1")["run1_2"] = "value1_2" + yield {"id": 1, "name": "John"} + yield {"id": 2, "name": "Jane"} + else: + # Check state is cleared for this resource + assert not resource_state("some_data_1") + if drop_sources: + assert_source_state_is_wiped(dlt.state()) + # Second dataset without name column to test tables are re-created + yield {"id": 3} + yield {"id": 4} + + @dlt.resource + def some_data_2(): + if first_run: + 
dlt.state()["source_key_2"] = "source_value_2" + resource_state("some_data_2")["run1_1"] = "value1_1" + resource_state("some_data_2")["run1_2"] = "value1_2" + yield {"id": 5, "name": "Joe"} + yield {"id": 6, "name": "Jill"} + else: + assert not resource_state("some_data_2") + if drop_sources: + assert_source_state_is_wiped(dlt.state()) + yield {"id": 7} + yield {"id": 8} + + @dlt.resource + def some_data_3(): + if first_run: + dlt.state()["source_key_3"] = "source_value_3" + resource_state("some_data_3")["run1_1"] = "value1_1" + yield {"id": 9, "name": "Jack"} + yield {"id": 10, "name": "Jill"} + else: + assert not resource_state("some_data_3") + if drop_sources: + assert_source_state_is_wiped(dlt.state()) + yield {"id": 11} + yield {"id": 12} + + @dlt.resource + def some_data_4(): + yield [] + + yield some_data_1 + yield some_data_2 + yield some_data_3 + yield some_data_4 + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_sql_configs=True, subset=["duckdb", "filesystem"], local_filesystem_configs=True + ), + ids=lambda x: x.name, +) +def test_refresh_drop_sources(destination_config: DestinationTestConfiguration): + pipeline = destination_config.setup_pipeline("refresh_full_test", refresh="drop_sources") + + # First run pipeline so destination so tables are created + info = pipeline.run(refresh_source(first_run=True, drop_sources=True)) + assert_load_info(info) + + # Second run of pipeline with only selected resources + info = pipeline.run( + refresh_source(first_run=False, drop_sources=True).with_resources( + "some_data_1", "some_data_2" + ) + ) + + assert set(t["name"] for t in pipeline.default_schema.data_tables(include_incomplete=True)) == { + "some_data_1", + "some_data_2", + # Table has never seen data and is not dropped + "some_data_4", + } + + # No "name" column should exist as table was dropped and re-created without it + assert_only_table_columns(pipeline, "some_data_1", ["id"]) + data = load_tables_to_dicts(pipeline, "some_data_1")["some_data_1"] + result = sorted([row["id"] for row in data]) + # Only rows from second run should exist + assert result == [3, 4] + + # Confirm resource tables not selected on second run got dropped + assert not table_exists(pipeline, "some_data_3") + # Loaded state is wiped + with pipeline.destination_client() as dest_client: + destination_state = load_pipeline_state_from_destination( + pipeline.pipeline_name, dest_client # type: ignore[arg-type] + ) + assert_source_state_is_wiped(destination_state["sources"]["refresh_source"]) + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_sql_configs=True, local_filesystem_configs=True, subset=["duckdb", "filesystem"] + ), + ids=lambda x: x.name, +) +def test_existing_schema_hash(destination_config: DestinationTestConfiguration): + """Test when new schema is identical to a previously stored schema after dropping and re-creating tables. 
+ The change should be detected regardless and tables are created again in destination db + """ + pipeline = destination_config.setup_pipeline("refresh_full_test", refresh="drop_sources") + + info = pipeline.run(refresh_source(first_run=True, drop_sources=True)) + assert_load_info(info) + first_schema_hash = pipeline.default_schema.version_hash + + # Second run with all tables dropped and only some tables re-created + info = pipeline.run( + refresh_source(first_run=False, drop_sources=True).with_resources( + "some_data_1", "some_data_2" + ) + ) + + # Just check the local schema + new_table_names = set( + t["name"] for t in pipeline.default_schema.data_tables(include_incomplete=True) + ) + assert new_table_names == {"some_data_1", "some_data_2", "some_data_4"} + + # Run again with all tables to ensure they are re-created + # The new schema in this case should match the schema of the first run exactly + info = pipeline.run(refresh_source(first_run=True, drop_sources=True)) + # Check table 3 was re-created + data = load_tables_to_dicts(pipeline, "some_data_3")["some_data_3"] + result = sorted([(row["id"], row["name"]) for row in data]) + assert result == [(9, "Jack"), (10, "Jill")] + + # Schema is identical to first schema + new_schema_hash = pipeline.default_schema.version_hash + assert new_schema_hash == first_schema_hash + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_sql_configs=True, local_filesystem_configs=True, subset=["duckdb", "filesystem"] + ), + ids=lambda x: x.name, +) +def test_refresh_drop_resources(destination_config: DestinationTestConfiguration): + # First run pipeline with load to destination so tables are created + pipeline = destination_config.setup_pipeline("refresh_full_test", refresh="drop_tables") + + info = pipeline.run(refresh_source(first_run=True)) + assert_load_info(info) + + # Second run of pipeline with only selected resources + info = pipeline.run( + refresh_source(first_run=False).with_resources("some_data_1", "some_data_2") + ) + + # Confirm resource tables not selected on second run are untouched + data = load_tables_to_dicts(pipeline, "some_data_3")["some_data_3"] + result = sorted([(row["id"], row["name"]) for row in data]) + assert result == [(9, "Jack"), (10, "Jill")] + + # Check the columns to ensure the name column was dropped + assert_only_table_columns(pipeline, "some_data_1", ["id"]) + data = load_tables_to_dicts(pipeline, "some_data_1")["some_data_1"] + # Only second run data + result = sorted([row["id"] for row in data]) + assert result == [3, 4] + + # Loaded state contains only keys created in second run + with pipeline.destination_client() as dest_client: + destination_state = load_pipeline_state_from_destination( + pipeline.pipeline_name, dest_client # type: ignore[arg-type] + ) + + source_state = destination_state["sources"]["refresh_source"] + # Source level state is kept + assert source_state["source_key_1"] == "source_value_1" + assert source_state["source_key_2"] == "source_value_2" + assert source_state["source_key_3"] == "source_value_3" + # Only resource excluded in second run remains + assert source_state["resources"]["some_data_3"] == {"run1_1": "value1_1"} + assert not source_state["resources"]["some_data_2"] + assert not source_state["resources"]["some_data_1"] + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_sql_configs=True, local_filesystem_configs=True, subset=["duckdb", "filesystem"] + ), + ids=lambda x: x.name, +) +def 
test_refresh_drop_data_only(destination_config: DestinationTestConfiguration): + """Refresh drop_data should truncate all selected tables before load""" + # First run pipeline with load to destination so tables are created + pipeline = destination_config.setup_pipeline("refresh_full_test", refresh="drop_data") + + info = pipeline.run(refresh_source(first_run=True), write_disposition="append") + assert_load_info(info) + + first_schema_hash = pipeline.default_schema.version_hash + + # Second run of pipeline with only selected resources + info = pipeline.run( + refresh_source(first_run=False).with_resources("some_data_1", "some_data_2"), + write_disposition="append", + ) + assert_load_info(info) + + # Schema should not be mutated + assert pipeline.default_schema.version_hash == first_schema_hash + + # Tables selected in second run are truncated and should only have data from second run + data = load_tables_to_dicts(pipeline, "some_data_1", "some_data_2", "some_data_3") + # name column still remains when table was truncated instead of dropped + # (except on filesystem where truncate and drop are the same) + if destination_config.destination == "filesystem": + result = sorted([row["id"] for row in data["some_data_1"]]) + assert result == [3, 4] + + result = sorted([row["id"] for row in data["some_data_2"]]) + assert result == [7, 8] + else: + result = sorted([(row["id"], row["name"]) for row in data["some_data_1"]]) + assert result == [(3, None), (4, None)] + + result = sorted([(row["id"], row["name"]) for row in data["some_data_2"]]) + assert result == [(7, None), (8, None)] + + # Other tables still have data from first run + result = sorted([(row["id"], row["name"]) for row in data["some_data_3"]]) + assert result == [(9, "Jack"), (10, "Jill")] + + # State of selected resources is wiped, source level state is kept + with pipeline.destination_client() as dest_client: + destination_state = load_pipeline_state_from_destination( + pipeline.pipeline_name, dest_client # type: ignore[arg-type] + ) + + source_state = destination_state["sources"]["refresh_source"] + assert source_state["source_key_1"] == "source_value_1" + assert source_state["source_key_2"] == "source_value_2" + assert source_state["source_key_3"] == "source_value_3" + assert not source_state["resources"]["some_data_1"] + assert not source_state["resources"]["some_data_2"] + assert source_state["resources"]["some_data_3"] == {"run1_1": "value1_1"} + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["duckdb"]), + ids=lambda x: x.name, +) +def test_refresh_drop_sources_multiple_sources(destination_config: DestinationTestConfiguration): + """ + Ensure only state and tables for currently selected source is dropped + """ + + @dlt.source + def refresh_source_2(first_run=True): + @dlt.resource + def source_2_data_1(): + pipeline_state, _ = current_pipeline_state(pipeline._container) + if first_run: + dlt.state()["source_2_key_1"] = "source_2_value_1" + resource_state("source_2_data_1")["run1_1"] = "value1_1" + yield {"product": "apple", "price": 1} + yield {"product": "banana", "price": 2} + else: + # First source should not have state wiped + assert ( + pipeline_state["sources"]["refresh_source"]["source_key_1"] == "source_value_1" + ) + assert pipeline_state["sources"]["refresh_source"]["resources"]["some_data_1"] == { + "run1_1": "value1_1", + "run1_2": "value1_2", + } + # Source state is wiped + assert_source_state_is_wiped(dlt.state()) + yield {"product": "orange"} + yield 
{"product": "pear"} + + @dlt.resource + def source_2_data_2(): + if first_run: + dlt.state()["source_2_key_2"] = "source_2_value_2" + resource_state("source_2_data_2")["run1_1"] = "value1_1" + yield {"product": "carrot", "price": 3} + yield {"product": "potato", "price": 4} + else: + assert_source_state_is_wiped(dlt.state()) + yield {"product": "cabbage"} + yield {"product": "lettuce"} + + yield source_2_data_1 + yield source_2_data_2 + + pipeline = destination_config.setup_pipeline("refresh_full_test", refresh="drop_sources") + + # Run both sources + info = pipeline.run( + [refresh_source(first_run=True, drop_sources=True), refresh_source_2(first_run=True)] + ) + assert_load_info(info, 2) + # breakpoint() + info = pipeline.run(refresh_source_2(first_run=False).with_resources("source_2_data_1")) + assert_load_info(info, 2) + + # Check source 1 schema still has all tables + table_names = set( + t["name"] for t in pipeline.schemas["refresh_source"].data_tables(include_incomplete=True) + ) + assert table_names == {"some_data_1", "some_data_2", "some_data_3", "some_data_4"} + + # Source 2 has only the selected tables + table_names = set( + t["name"] for t in pipeline.schemas["refresh_source_2"].data_tables(include_incomplete=True) + ) + assert table_names == {"source_2_data_1"} + + # Destination still has tables from source 1 + data = load_tables_to_dicts(pipeline, "some_data_1") + result = sorted([(row["id"], row["name"]) for row in data["some_data_1"]]) + assert result == [(1, "John"), (2, "Jane")] + + # # First table from source2 exists, with only first column + data = load_tables_to_dicts(pipeline, "source_2_data_1", schema_name="refresh_source_2") + assert_only_table_columns( + pipeline, "source_2_data_1", ["product"], schema_name="refresh_source_2" + ) + result = sorted([row["product"] for row in data["source_2_data_1"]]) + assert result == ["orange", "pear"] + + # # Second table from source 2 is gone + assert not table_exists(pipeline, "source_2_data_2", schema_name="refresh_source_2") + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_sql_configs=True, local_filesystem_configs=True, subset=["duckdb", "filesystem"] + ), + ids=lambda x: x.name, +) +def test_refresh_argument_to_run(destination_config: DestinationTestConfiguration): + pipeline = destination_config.setup_pipeline("refresh_full_test") + + info = pipeline.run(refresh_source(first_run=True)) + assert_load_info(info) + + info = pipeline.run( + refresh_source(first_run=False).with_resources("some_data_3"), + refresh="drop_sources", + ) + assert_load_info(info) + + # Check local schema to confirm refresh was at all applied + tables = set(t["name"] for t in pipeline.default_schema.data_tables()) + assert tables == {"some_data_3"} + + # Run again without refresh to confirm refresh option doesn't persist on pipeline + info = pipeline.run(refresh_source(first_run=False).with_resources("some_data_2")) + assert_load_info(info) + + # Nothing is dropped + tables = set(t["name"] for t in pipeline.default_schema.data_tables()) + assert tables == {"some_data_2", "some_data_3"} + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_sql_configs=True, local_filesystem_configs=True, subset=["duckdb", "filesystem"] + ), + ids=lambda x: x.name, +) +def test_refresh_argument_to_extract(destination_config: DestinationTestConfiguration): + pipeline = destination_config.setup_pipeline("refresh_full_test") + + info = pipeline.run(refresh_source(first_run=True)) + 
assert_load_info(info) + + pipeline.extract( + refresh_source(first_run=False).with_resources("some_data_3"), + refresh="drop_sources", + ) + + tables = set(t["name"] for t in pipeline.default_schema.data_tables(include_incomplete=True)) + # All other data tables removed + assert tables == {"some_data_3", "some_data_4"} + + # Run again without refresh to confirm refresh option doesn't persist on pipeline + pipeline.extract(refresh_source(first_run=False).with_resources("some_data_2")) + + tables = set(t["name"] for t in pipeline.default_schema.data_tables(include_incomplete=True)) + assert tables == {"some_data_2", "some_data_3", "some_data_4"} diff --git a/tests/load/pipeline/test_replace_disposition.py b/tests/load/pipeline/test_replace_disposition.py index f3b58aa5f6..464b5aea1f 100644 --- a/tests/load/pipeline/test_replace_disposition.py +++ b/tests/load/pipeline/test_replace_disposition.py @@ -260,7 +260,7 @@ def test_replace_table_clearing( os.environ["DESTINATION__REPLACE_STRATEGY"] = replace_strategy pipeline = destination_config.setup_pipeline( - "test_replace_table_clearing", dataset_name="test_replace_table_clearing", full_refresh=True + "test_replace_table_clearing", dataset_name="test_replace_table_clearing", dev_mode=True ) @dlt.resource(name="main_resource", write_disposition="replace", primary_key="id") diff --git a/tests/load/pipeline/test_restore_state.py b/tests/load/pipeline/test_restore_state.py index a2d00001c2..b287619e8c 100644 --- a/tests/load/pipeline/test_restore_state.py +++ b/tests/load/pipeline/test_restore_state.py @@ -355,7 +355,7 @@ def some_data(): # full refresh will not restore pipeline even if requested p._wipe_working_folder() p = destination_config.setup_pipeline( - pipeline_name=pipeline_name, dataset_name=dataset_name, full_refresh=True + pipeline_name=pipeline_name, dataset_name=dataset_name, dev_mode=True ) p.run(loader_file_format=destination_config.file_format) assert p.default_schema_name is None diff --git a/tests/load/pipeline/test_write_disposition_changes.py b/tests/load/pipeline/test_write_disposition_changes.py index b9c2b35717..16c589352e 100644 --- a/tests/load/pipeline/test_write_disposition_changes.py +++ b/tests/load/pipeline/test_write_disposition_changes.py @@ -27,7 +27,7 @@ def data_with_subtables(offset: int) -> Any: ) def test_switch_from_merge(destination_config: DestinationTestConfiguration): pipeline = destination_config.setup_pipeline( - pipeline_name="test_switch_from_merge", full_refresh=True + pipeline_name="test_switch_from_merge", dev_mode=True ) info = pipeline.run( @@ -96,7 +96,7 @@ def test_switch_from_merge(destination_config: DestinationTestConfiguration): @pytest.mark.parametrize("with_root_key", [True, False]) def test_switch_to_merge(destination_config: DestinationTestConfiguration, with_root_key: bool): pipeline = destination_config.setup_pipeline( - pipeline_name="test_switch_to_merge", full_refresh=True + pipeline_name="test_switch_to_merge", dev_mode=True ) @dlt.source() diff --git a/tests/load/qdrant/test_pipeline.py b/tests/load/qdrant/test_pipeline.py index fcc8fcbd71..d50b50282a 100644 --- a/tests/load/qdrant/test_pipeline.py +++ b/tests/load/qdrant/test_pipeline.py @@ -301,7 +301,7 @@ def some_data(): def test_merge_github_nested() -> None: - p = dlt.pipeline(destination="qdrant", dataset_name="github1", full_refresh=True) + p = dlt.pipeline(destination="qdrant", dataset_name="github1", dev_mode=True) assert p.dataset_name.startswith("github1_202") with open( @@ -347,7 +347,7 @@ def 
test_merge_github_nested() -> None: def test_empty_dataset_allowed() -> None: # dataset_name is optional so dataset name won't be autogenerated when not explicitly passed - p = dlt.pipeline(destination="qdrant", full_refresh=True) + p = dlt.pipeline(destination="qdrant", dev_mode=True) client: QdrantClient = p.destination_client() # type: ignore[assignment] assert p.dataset_name is None diff --git a/tests/load/synapse/test_synapse_table_indexing.py b/tests/load/synapse/test_synapse_table_indexing.py index c9ecba17a1..a9d426ad4a 100644 --- a/tests/load/synapse/test_synapse_table_indexing.py +++ b/tests/load/synapse/test_synapse_table_indexing.py @@ -52,7 +52,7 @@ def items_without_table_index_type_specified() -> Iterator[Any]: pipeline_name=f"test_default_table_index_type_{table_index_type}", destination="synapse", dataset_name=f"test_default_table_index_type_{table_index_type}", - full_refresh=True, + dev_mode=True, ) job_client = pipeline.destination_client() @@ -118,7 +118,7 @@ def items_with_table_index_type_specified() -> Iterator[Any]: pipeline_name=f"test_table_index_type_{table_index_type}", destination="synapse", dataset_name=f"test_table_index_type_{table_index_type}", - full_refresh=True, + dev_mode=True, ) # An invalid value for `table_index_type` should raise a ValueError. diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index 08b80af928..7e360a6664 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -327,16 +327,18 @@ def test_drop_tables(client: SqlJobClientBase) -> None: del schema.tables[tbl] schema._bump_version() client.drop_tables(*tables_to_drop) + client._update_schema_in_storage(schema) # Schema was deleted, load it in again if isinstance(client, WithStagingDataset): with contextlib.suppress(DatabaseUndefinedRelation): with client.with_staging_dataset(): - client.drop_tables(*tables_to_drop, replace_schema=False) + client.drop_tables(*tables_to_drop, delete_schema=False) # drop again - should not break anything client.drop_tables(*tables_to_drop) + client._update_schema_in_storage(schema) if isinstance(client, WithStagingDataset): with contextlib.suppress(DatabaseUndefinedRelation): with client.with_staging_dataset(): - client.drop_tables(*tables_to_drop, replace_schema=False) + client.drop_tables(*tables_to_drop, delete_schema=False) # Verify requested tables are dropped for tbl in tables_to_drop: diff --git a/tests/load/utils.py b/tests/load/utils.py index 445f8d815b..5a999dc1b7 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -153,7 +153,7 @@ def setup(self) -> None: os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "True" def setup_pipeline( - self, pipeline_name: str, dataset_name: str = None, full_refresh: bool = False, **kwargs + self, pipeline_name: str, dataset_name: str = None, dev_mode: bool = False, **kwargs ) -> dlt.Pipeline: """Convenience method to setup pipeline with this configuration""" self.setup() @@ -162,7 +162,7 @@ def setup_pipeline( destination=self.destination, staging=self.staging, dataset_name=dataset_name or pipeline_name, - full_refresh=full_refresh, + dev_mode=dev_mode, **kwargs, ) return pipeline diff --git a/tests/load/weaviate/test_pipeline.py b/tests/load/weaviate/test_pipeline.py index fb089ad174..ee42ab59d8 100644 --- a/tests/load/weaviate/test_pipeline.py +++ b/tests/load/weaviate/test_pipeline.py @@ -303,7 +303,7 @@ def some_data(): def test_merge_github_nested() -> None: - p = dlt.pipeline(destination="weaviate", dataset_name="github1", full_refresh=True) + p = 
dlt.pipeline(destination="weaviate", dataset_name="github1", dev_mode=True) assert p.dataset_name.startswith("github1_202") with open( @@ -352,7 +352,7 @@ def test_merge_github_nested() -> None: def test_empty_dataset_allowed() -> None: # weaviate dataset_name is optional so dataset name won't be autogenerated when not explicitly passed - p = dlt.pipeline(destination="weaviate", full_refresh=True) + p = dlt.pipeline(destination="weaviate", dev_mode=True) # check if we use localhost client: WeaviateClient = p.destination_client() # type: ignore[assignment] if "localhost" not in client.config.credentials.url: diff --git a/tests/pipeline/cases/github_pipeline/github_extract.py b/tests/pipeline/cases/github_pipeline/github_extract.py index 6be6643947..c9ed672fad 100644 --- a/tests/pipeline/cases/github_pipeline/github_extract.py +++ b/tests/pipeline/cases/github_pipeline/github_extract.py @@ -5,9 +5,7 @@ from github_pipeline import github # type: ignore[import-not-found] if __name__ == "__main__": - p = dlt.pipeline( - "dlt_github_pipeline", destination="duckdb", dataset_name="github_3", full_refresh=False - ) + p = dlt.pipeline("dlt_github_pipeline", destination="duckdb", dataset_name="github_3") github_source = github() if len(sys.argv) > 1: # load only N issues diff --git a/tests/pipeline/cases/github_pipeline/github_pipeline.py b/tests/pipeline/cases/github_pipeline/github_pipeline.py index c55bd02ba0..aa0f6d0e0e 100644 --- a/tests/pipeline/cases/github_pipeline/github_pipeline.py +++ b/tests/pipeline/cases/github_pipeline/github_pipeline.py @@ -33,9 +33,7 @@ def load_issues( if __name__ == "__main__": - p = dlt.pipeline( - "dlt_github_pipeline", destination="duckdb", dataset_name="github_3", full_refresh=False - ) + p = dlt.pipeline("dlt_github_pipeline", destination="duckdb", dataset_name="github_3") github_source = github() if len(sys.argv) > 1: # load only N issues diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 1c4383405b..fd5099af9b 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -94,15 +94,15 @@ def test_default_pipeline_dataset() -> None: assert p.dataset_name in possible_dataset_names -def test_run_full_refresh_default_dataset() -> None: - p = dlt.pipeline(full_refresh=True, destination="filesystem") +def test_run_dev_mode_default_dataset() -> None: + p = dlt.pipeline(dev_mode=True, destination="filesystem") assert p.dataset_name.endswith(p._pipeline_instance_id) # restore this pipeline - r_p = dlt.attach(full_refresh=False) + r_p = dlt.attach(dev_mode=False) assert r_p.dataset_name.endswith(p._pipeline_instance_id) # dummy does not need dataset - p = dlt.pipeline(full_refresh=True, destination="dummy") + p = dlt.pipeline(dev_mode=True, destination="dummy") assert p.dataset_name is None # simulate set new dataset p._set_destinations("filesystem") @@ -112,11 +112,11 @@ def test_run_full_refresh_default_dataset() -> None: assert p.dataset_name and p.dataset_name.endswith(p._pipeline_instance_id) -def test_run_full_refresh_underscored_dataset() -> None: - p = dlt.pipeline(full_refresh=True, dataset_name="_main_") +def test_run_dev_mode_underscored_dataset() -> None: + p = dlt.pipeline(dev_mode=True, dataset_name="_main_") assert p.dataset_name.endswith(p._pipeline_instance_id) # restore this pipeline - r_p = dlt.attach(full_refresh=False) + r_p = dlt.attach(dev_mode=False) assert r_p.dataset_name.endswith(p._pipeline_instance_id) @@ -895,7 +895,7 @@ def test_extract_all_data_types() -> None: def 
test_set_get_local_value() -> None: - p = dlt.pipeline(destination="dummy", full_refresh=True) + p = dlt.pipeline(destination="dummy", dev_mode=True) value = uniq_id() # value is set p.set_local_state_val(value, value) @@ -1862,8 +1862,8 @@ def _run_pipeline(pipeline, gen_) -> LoadInfo: return pipeline.run(gen_()) # declare pipelines in main thread then run them "async" - pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", full_refresh=True) - pipeline_2 = dlt.pipeline("pipeline_2", destination="duckdb", full_refresh=True) + pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", dev_mode=True) + pipeline_2 = dlt.pipeline("pipeline_2", destination="duckdb", dev_mode=True) async def _run_async(): loop = asyncio.get_running_loop() @@ -1912,7 +1912,7 @@ def api_fetch(page_num): else: return [] - pipeline = dlt.pipeline("pipeline_1", destination="duckdb", full_refresh=True) + pipeline = dlt.pipeline("pipeline_1", destination="duckdb", dev_mode=True) load_info = pipeline.run(product()) assert_load_info(load_info) assert pipeline.last_trace.last_normalize_info.row_counts["product"] == 12 diff --git a/tests/pipeline/test_resources_evaluation.py b/tests/pipeline/test_resources_evaluation.py index 5a85c06462..542d0209d6 100644 --- a/tests/pipeline/test_resources_evaluation.py +++ b/tests/pipeline/test_resources_evaluation.py @@ -30,7 +30,7 @@ async def __anext__(self): # return the counter value return {"i": self.counter} - pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", full_refresh=True) + pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", dev_mode=True) pipeline_1.run(AsyncIterator, table_name="async") with pipeline_1.sql_client() as c: with c.execute_query("SELECT * FROM async") as cur: @@ -53,7 +53,7 @@ async def async_gen_resource(): await asyncio.sleep(0.1) yield {"letter": l_} - pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", full_refresh=True) + pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", dev_mode=True) # pure async function pipeline_1.run(async_gen_table(), table_name="async") @@ -81,7 +81,7 @@ async def _gen(idx): for idx_ in range(3): yield _gen(idx_) - pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", full_refresh=True) + pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", dev_mode=True) pipeline_1.run(async_inner_table(), table_name="async") with pipeline_1.sql_client() as c: with c.execute_query("SELECT * FROM async") as cur: @@ -114,7 +114,7 @@ async def async_transformer(item): "letter": item["letter"] + "t", } - pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", full_refresh=True) + pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", dev_mode=True) pipeline_1.run(async_transformer(), table_name="async") with pipeline_1.sql_client() as c: @@ -174,7 +174,7 @@ def source(): elif resource_mode == "second_async": return [sync_resource1(), async_resource2()] - pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", full_refresh=True) + pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", dev_mode=True) pipeline_1.run(source()) with pipeline_1.sql_client() as c: @@ -243,7 +243,7 @@ def resource2(): def source(): return [resource1(), resource2()] - pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", full_refresh=True) + pipeline_1 = dlt.pipeline("pipeline_1", destination="duckdb", dev_mode=True) pipeline_1.run(source()) # all records should be here diff --git a/tests/pipeline/test_schema_contracts.py b/tests/pipeline/test_schema_contracts.py index 
7eafb1ea24..4958299368 100644 --- a/tests/pipeline/test_schema_contracts.py +++ b/tests/pipeline/test_schema_contracts.py @@ -179,7 +179,7 @@ def get_pipeline(): pipeline_name=uniq_id(), destination="duckdb", credentials=duckdb.connect(":memory:"), - full_refresh=True, + dev_mode=True, ) diff --git a/tests/pipeline/test_schema_updates.py b/tests/pipeline/test_schema_updates.py index be397f796c..311bd55b28 100644 --- a/tests/pipeline/test_schema_updates.py +++ b/tests/pipeline/test_schema_updates.py @@ -5,7 +5,7 @@ def test_schema_updates() -> None: os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately - p = dlt.pipeline(pipeline_name="test_schema_updates", full_refresh=True, destination="dummy") + p = dlt.pipeline(pipeline_name="test_schema_updates", dev_mode=True, destination="dummy") @dlt.source() def source(): diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index 072a12782c..b4dae919f8 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -14,6 +14,7 @@ from dlt.destinations.fs_client import FSClientBase from dlt.pipeline.exceptions import SqlClientNotAvailable from dlt.common.storages import FileStorage +from dlt.destinations.exceptions import DatabaseUndefinedRelation from tests.utils import TEST_STORAGE_ROOT @@ -172,12 +173,13 @@ def _load_tables_to_dicts_fs(p: dlt.Pipeline, *table_names: str) -> Dict[str, Li def _load_tables_to_dicts_sql( - p: dlt.Pipeline, *table_names: str + p: dlt.Pipeline, *table_names: str, schema_name: str = None ) -> Dict[str, List[Dict[str, Any]]]: result = {} + schema = p.default_schema if not schema_name else p.schemas[schema_name] for table_name in table_names: table_rows = [] - columns = p.default_schema.get_table_columns(table_name).keys() + columns = schema.get_table_columns(table_name).keys() query_columns = ",".join(map(p.sql_client().capabilities.escape_identifier, columns)) with p.sql_client() as c: @@ -191,9 +193,23 @@ def _load_tables_to_dicts_sql( return result -def load_tables_to_dicts(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: - func = _load_tables_to_dicts_fs if _is_filesystem(p) else _load_tables_to_dicts_sql - return func(p, *table_names) +def load_tables_to_dicts( + p: dlt.Pipeline, *table_names: str, schema_name: str = None +) -> Dict[str, List[Dict[str, Any]]]: + if _is_filesystem(p): + return _load_tables_to_dicts_fs(p, *table_names) + return _load_tables_to_dicts_sql(p, *table_names, schema_name=schema_name) + + +def assert_only_table_columns( + p: dlt.Pipeline, table_name: str, expected_columns: Sequence[str], schema_name: str = None +) -> None: + """Table has all and only the expected columns (excluding _dlt columns)""" + rows = load_tables_to_dicts(p, table_name, schema_name=schema_name)[table_name] + assert rows, f"Table {table_name} is empty" + # Ignore _dlt columns + columns = set(col for col in rows[0].keys() if not col.startswith("_dlt")) + assert columns == set(expected_columns) # @@ -244,6 +260,22 @@ def assert_data_table_counts(p: dlt.Pipeline, expected_counts: DictStrAny) -> No # +def table_exists(p: dlt.Pipeline, table_name: str, schema_name: str = None) -> bool: + """Returns True if table exists in the destination database/filesystem""" + if _is_filesystem(p): + client = p._fs_client(schema_name=schema_name) + files = client.list_table_files(table_name) + return not not files + + with p.sql_client(schema_name=schema_name) as c: + try: + qual_table_name = c.make_qualified_table_name(table_name) + c.execute_sql(f"SELECT 1 FROM {qual_table_name} LIMIT 
1") + return True + except DatabaseUndefinedRelation: + return False + + def _assert_table_sql( p: dlt.Pipeline, table_name: str, From 701d50381ae3edb7d39019c3913b9023501fd99f Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Mon, 3 Jun 2024 10:19:15 -0400 Subject: [PATCH 05/61] Ensure arrow field's nullable flag matches the schema column (#1429) --- dlt/common/libs/pyarrow.py | 26 ++++++++--- dlt/normalize/items_normalizers.py | 4 +- tests/libs/pyarrow/test_pyarrow_normalizer.py | 31 +++++++++++++ tests/load/pipeline/test_arrow_loading.py | 45 +++++++++++++++++++ 4 files changed, 98 insertions(+), 8 deletions(-) diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index 58ddf69cea..d6ee5be4cd 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -223,9 +223,18 @@ def should_normalize_arrow_schema( schema: pyarrow.Schema, columns: TTableSchemaColumns, naming: NamingConvention, -) -> Tuple[bool, Mapping[str, str], Dict[str, str], TTableSchemaColumns]: +) -> Tuple[bool, Mapping[str, str], Dict[str, str], Dict[str, bool], TTableSchemaColumns]: rename_mapping = get_normalized_arrow_fields_mapping(schema, naming) rev_mapping = {v: k for k, v in rename_mapping.items()} + nullable_mapping = {k: v.get("nullable", True) for k, v in columns.items()} + # All fields from arrow schema that have nullable set to different value than in columns + # Key is the renamed column name + nullable_updates: Dict[str, bool] = {} + for field in schema: + norm_name = rename_mapping[field.name] + if norm_name in nullable_mapping and field.nullable != nullable_mapping[norm_name]: + nullable_updates[norm_name] = nullable_mapping[norm_name] + dlt_tables = list(map(naming.normalize_table_identifier, ("_dlt_id", "_dlt_load_id"))) # remove all columns that are dlt columns but are not present in arrow schema. we do not want to add such columns @@ -239,8 +248,8 @@ def should_normalize_arrow_schema( # check if nothing to rename skip_normalize = ( list(rename_mapping.keys()) == list(rename_mapping.values()) == list(columns.keys()) - ) - return not skip_normalize, rename_mapping, rev_mapping, columns + ) and not nullable_updates + return not skip_normalize, rename_mapping, rev_mapping, nullable_updates, columns def normalize_py_arrow_item( @@ -254,10 +263,11 @@ def normalize_py_arrow_item( 1. arrow schema field names will be normalized according to `naming` 2. arrows columns will be reordered according to `columns` 3. empty columns will be inserted if they are missing, types will be generated using `caps` + 4. arrow columns with different nullability than corresponding schema columns will be updated """ schema = item.schema - should_normalize, rename_mapping, rev_mapping, columns = should_normalize_arrow_schema( - schema, columns, naming + should_normalize, rename_mapping, rev_mapping, nullable_updates, columns = ( + should_normalize_arrow_schema(schema, columns, naming) ) if not should_normalize: return item @@ -270,8 +280,12 @@ def normalize_py_arrow_item( field_name = rev_mapping.pop(column_name, column_name) if field_name in rename_mapping: idx = schema.get_field_index(field_name) + new_field = schema.field(idx).with_name(column_name) + if column_name in nullable_updates: + # Set field nullable to match column + new_field = new_field.with_nullable(nullable_updates[column_name]) # use renamed field - new_fields.append(schema.field(idx).with_name(column_name)) + new_fields.append(new_field) new_columns.append(item.column(idx)) else: # column does not exist in pyarrow. 
create empty field and column diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index 742125850d..81220da2dd 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -295,7 +295,7 @@ def _write_with_dlt_columns( items_count += batch.num_rows # we may need to normalize if is_native_arrow_writer and should_normalize is None: - should_normalize, _, _, _ = pyarrow.should_normalize_arrow_schema( + should_normalize, _, _, _, _ = pyarrow.should_normalize_arrow_schema( batch.schema, columns_schema, schema.naming ) if should_normalize: @@ -376,7 +376,7 @@ def __call__(self, extracted_items_file: str, root_table_name: str) -> List[TSch ) if not must_rewrite: # in rare cases normalization may be needed - must_rewrite, _, _, _ = pyarrow.should_normalize_arrow_schema( + must_rewrite, _, _, _, _ = pyarrow.should_normalize_arrow_schema( arrow_schema, self.schema.get_table_columns(root_table_name), self.schema.naming ) if must_rewrite: diff --git a/tests/libs/pyarrow/test_pyarrow_normalizer.py b/tests/libs/pyarrow/test_pyarrow_normalizer.py index 25871edd45..63abcbc92a 100644 --- a/tests/libs/pyarrow/test_pyarrow_normalizer.py +++ b/tests/libs/pyarrow/test_pyarrow_normalizer.py @@ -99,6 +99,37 @@ def test_default_dlt_columns_not_added() -> None: assert _row_at_index(result, 0) == [None, None, 1] +def test_non_nullable_columns() -> None: + """Tests the case where arrow table is created with incomplete schema info, + such as when converting pandas dataframe to arrow. In this case normalize + should update not-null constraints in the arrow schema. + """ + table = pa.Table.from_pylist( + [ + { + "col1": 1, + "col2": "hello", + # Include column that will be renamed by normalize + # To ensure nullable flag mapping is correct + "Col 3": "world", + }, + ] + ) + columns = [ + new_column("col1", "bigint", nullable=False), + new_column("col2", "text"), + new_column("col_3", "text", nullable=False), + ] + result = _normalize(table, columns) + + assert result.column_names == ["col1", "col2", "col_3"] + # Not-null columns are updated in arrow + assert result.schema.field("col1").nullable is False + assert result.schema.field("col_3").nullable is False + # col2 is still nullable + assert result.schema.field("col2").nullable is True + + @pytest.mark.skip(reason="Somehow this does not fail, should we add an exception??") def test_fails_if_adding_non_nullable_column() -> None: table = pa.Table.from_pylist( diff --git a/tests/load/pipeline/test_arrow_loading.py b/tests/load/pipeline/test_arrow_loading.py index c5a37ee5bb..0bddfaabee 100644 --- a/tests/load/pipeline/test_arrow_loading.py +++ b/tests/load/pipeline/test_arrow_loading.py @@ -217,3 +217,48 @@ def some_data(): # Parquet schema is written with normalized column names assert result_tbl.schema.names == expected_column_names + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_sql_configs=True, + default_staging_configs=True, + all_staging_configs=True, + default_vector_configs=True, + ), + ids=lambda x: x.name, +) +@pytest.mark.parametrize("item_type", ["arrow-table", "pandas", "arrow-batch"]) +def test_load_arrow_with_not_null_columns( + item_type: TestDataItemFormat, destination_config: DestinationTestConfiguration +) -> None: + """Resource schema contains non-nullable columns. 
Arrow schema should be written accordingly""" + item, records, _ = arrow_table_all_data_types(item_type, include_json=False, include_time=False) + + @dlt.resource(primary_key="string", columns=[{"name": "int", "nullable": False}]) + def some_data(): + yield item + + pipeline = destination_config.setup_pipeline("arrow_" + uniq_id()) + + pipeline.extract(some_data()) + + norm_storage = pipeline._get_normalize_storage() + extract_files = [ + fn for fn in norm_storage.list_files_to_normalize_sorted() if fn.endswith(".parquet") + ] + assert len(extract_files) == 1 + + # Check the extracted parquet file. It should have the respective non-nullable column in schema + with norm_storage.extracted_packages.storage.open_file(extract_files[0], "rb") as f: + result_tbl = pa.parquet.read_table(f) + assert result_tbl.schema.field("string").nullable is False + assert result_tbl.schema.field("string").type == pa.string() + assert result_tbl.schema.field("int").nullable is False + assert result_tbl.schema.field("int").type == pa.int64() + + pipeline.normalize() + # Load is succesful + info = pipeline.load() + assert_load_info(info) From 710651a56351bdd4c744cabe588342390ca9b2c5 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Mon, 3 Jun 2024 16:22:35 +0200 Subject: [PATCH 06/61] fix async tests and add note to docs (#1439) --- docs/website/docs/general-usage/resource.md | 3 +++ tests/extract/test_sources.py | 9 +++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md index 3ab485486e..ac7f7e6b38 100644 --- a/docs/website/docs/general-usage/resource.md +++ b/docs/website/docs/general-usage/resource.md @@ -421,6 +421,9 @@ assert list(r) == list(range(10)) > 💡 If you are paremetrizing the value of `add_limit` and sometimes need it to be disabled, you can set `None` or `-1` > to disable the limiting. You can also set the limit to `0` for the resource to not yield any items. +> 💡 For internal reasons, async resources with a limit added, occassionally produce one item more than the limit +> on some runs. This behavior is not deterministic. + ### Set table name and adjust schema You can change the schema of a resource, be it standalone or as a part of a source. 
Look for method diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index 125c699c90..cd6ee4c3d5 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -806,14 +806,15 @@ async def r_async(): sync_list = list(r) async_list = list(r_async().add_limit(limit)) - # check the expected results - assert sync_list == async_list if limit == 10: assert sync_list == list(range(10)) + # we have edge cases where the async list will have one extra item + # possibly due to timing issues, maybe some other implementation problem + assert (async_list == list(range(10))) or (async_list == list(range(11))) elif limit in [None, -1]: - assert sync_list == list(range(20)) + assert sync_list == async_list == list(range(20)) elif limit == 0: - assert sync_list == [] + assert sync_list == async_list == [] else: raise AssertionError(f"Unexpected limit: {limit}") From 47c17b2e4fab4b55bcfdd281276ae426b5915ed9 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Tue, 4 Jun 2024 11:16:47 +0530 Subject: [PATCH 07/61] Updated for relative links (#1431) --- docs/website/docs/reference/frequently-asked-questions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/reference/frequently-asked-questions.md b/docs/website/docs/reference/frequently-asked-questions.md index 594c5e6522..e864be1c14 100644 --- a/docs/website/docs/reference/frequently-asked-questions.md +++ b/docs/website/docs/reference/frequently-asked-questions.md @@ -81,14 +81,14 @@ You can also delete it with Python using [Bigquery client.](https://cloud.google ## How can I develop a "custom" pagination tracker? -You can use `dlt.sources.incremental` to create a custom cursor for tracking pagination in data streams that lack a specific cursor field. An example can be found in the [Incremental loading with a cursor](https://deploy-preview-1204--dlt-hub-docs.netlify.app/docs/general-usage/incremental-loading#incremental-loading-with-a-cursor-field). +You can use `dlt.sources.incremental` to create a custom cursor for tracking pagination in data streams that lack a specific cursor field. An example can be found in the [Incremental loading with a cursor](../general-usage/incremental-loading.md#incremental-loading-with-a-cursor-field). Alternatively, you can manage the state directly in Python. You can access and modify the state like a standard Python dictionary: ```py state = dlt.current.resource_state() state["your_custom_key"] = "your_value" ``` -This method allows you to create custom pagination logic based on your requirements. An example of using `resource_state()` for pagination can be found [here](https://dlthub.com/docs/general-usage/incremental-loading#custom-incremental-loading-with-pipeline-state). +This method allows you to create custom pagination logic based on your requirements. An example of using `resource_state()` for pagination can be found [here](../general-usage/incremental-loading#custom-incremental-loading-with-pipeline-state). However, be cautious about overusing the state dictionary, especially in cases involving substreams for each user, as it might become unwieldy. A better strategy might involve tracking users incrementally. Then, upon updates, you only refresh the affected users' substreams entirely. This consideration helps maintain efficiency and manageability in your custom pagination implementation. 
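For reference, the custom pagination pattern described in the FAQ hunk above can be sketched as follows. This is a minimal illustration rather than code from the patches: `api_fetch_page` and the `last_page` state key are hypothetical placeholders, and only `dlt.resource` and `dlt.current.resource_state()` as shown in the document are assumed.

```py
import dlt

@dlt.resource(name="events", write_disposition="append")
def events(api_fetch_page):
    # api_fetch_page is a hypothetical callable: page number -> list of records ([] when exhausted)
    state = dlt.current.resource_state()
    # resume from the page recorded on the previous run; 0 on the first run
    page = state.setdefault("last_page", 0)
    while items := api_fetch_page(page):
        yield items
        page += 1
        # persist progress so the next run continues where this one stopped
        state["last_page"] = page

# usage sketch:
# pipeline = dlt.pipeline("custom_pagination", destination="duckdb")
# pipeline.run(events(lambda page: [{"id": page}] if page < 3 else []))
```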
From 5b9c71564e8c4b9e584e30c40d098499ee5e1a02 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Tue, 4 Jun 2024 12:26:12 +0200 Subject: [PATCH 08/61] skips non resolvable fields from appearing in sample secrets.toml, tests if dataset_name is gone (#1432) --- dlt/cli/config_toml_writer.py | 7 ++++--- tests/cli/test_init_command.py | 8 +++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/dlt/cli/config_toml_writer.py b/dlt/cli/config_toml_writer.py index 7ff7f735eb..8a549a60ff 100644 --- a/dlt/cli/config_toml_writer.py +++ b/dlt/cli/config_toml_writer.py @@ -4,6 +4,7 @@ from tomlkit.container import Container as TOMLContainer from collections.abc import Sequence as C_Sequence +from dlt.common.configuration.specs.base_configuration import is_hint_not_resolved from dlt.common.pendulum import pendulum from dlt.common.configuration.specs import ( BaseConfiguration, @@ -11,7 +12,7 @@ extract_inner_hint, ) from dlt.common.data_types import py_type_to_sc_type -from dlt.common.typing import AnyType, is_final_type, is_optional_type +from dlt.common.typing import AnyType, is_optional_type class WritableConfigValue(NamedTuple): @@ -62,9 +63,9 @@ def write_value( # skip if table contains the name already if name in toml_table and not overwrite_existing: return - # do not dump final and optional fields if they are not of special interest + # do not dump nor resolvable and optional fields if they are not of special interest if ( - is_final_type(hint) or is_optional_type(hint) or default_value is not None + is_hint_not_resolved(hint) or is_optional_type(hint) or default_value is not None ) and not is_default_of_interest: return # get the inner hint to generate cool examples diff --git a/tests/cli/test_init_command.py b/tests/cli/test_init_command.py index 3d9fd0909f..03eded9da0 100644 --- a/tests/cli/test_init_command.py +++ b/tests/cli/test_init_command.py @@ -572,7 +572,13 @@ def assert_common_files( # destination is there assert secrets.get_value(destination_name, type, None, "destination") is not None # certain values are never there - for not_there in ["destination_name", "default_schema_name", "as_staging", "staging_config"]: + for not_there in [ + "destination_name", + "default_schema_name", + "as_staging", + "staging_config", + "dataset_name", + ]: assert secrets.get_value(not_there, type, None, "destination", destination_name)[0] is None return visitor, secrets From e2528898ad0cf211d0edc3a1f1434292fc155b4b Mon Sep 17 00:00:00 2001 From: rudolfix Date: Wed, 5 Jun 2024 16:14:05 +0200 Subject: [PATCH 09/61] fixes config injection edge cases (#1430) * runs motherduck init on ci * fixes edge cases for optional new types, new types of optional types and literal detection * skips streamlit tests if not installed * defines singleton sentinels for dlt.config.value and dlt.secrets.value * uses sentinels to detect config and secret values, removes source code unparsing * converts sentinels if used in configspec to right defaults * adds SPECs to callables as attributes * simplifies and fixes nested dict update, adds dict clone in utils * adds several missing config injects tests * gives precedence to apply_hints when setting incremental, fixes resolve and merge configs, detect EMPTY incremental on resolve * moves wrapping resources in config and incremental wrappers from decorator to resources, rewraps resource on clone to separate sections and incremental instances * fixes mssql examples in sql_database docs * allows to use None as explicit value when resolving config, allows to use sentinels 
to request injected values * adds is_subclass working with type aliases * tests configspecs with generics --- .github/workflows/test_destinations.yml | 2 +- dlt/cli/config_toml_writer.py | 8 +- dlt/common/configuration/accessors.py | 8 +- dlt/common/configuration/container.py | 3 +- dlt/common/configuration/inject.py | 93 ++++--- dlt/common/configuration/providers/context.py | 3 +- dlt/common/configuration/resolve.py | 136 +++++---- .../configuration/specs/base_configuration.py | 44 ++- dlt/common/destination/reference.py | 3 +- dlt/common/libs/pydantic.py | 5 +- dlt/common/reflection/spec.py | 63 ++--- dlt/common/storages/live_schema_storage.py | 5 +- dlt/common/typing.py | 120 ++++++-- dlt/common/utils.py | 44 +-- .../impl/destination/configuration.py | 2 +- dlt/extract/decorators.py | 108 +++---- dlt/extract/extract.py | 8 +- dlt/extract/hints.py | 13 +- dlt/extract/incremental/__init__.py | 67 +++-- dlt/extract/resource.py | 132 +++++++-- dlt/extract/source.py | 1 + dlt/helpers/dbt/__init__.py | 6 +- dlt/helpers/dbt/configuration.py | 2 +- dlt/helpers/dbt/runner.py | 12 +- dlt/pipeline/__init__.py | 25 +- dlt/pipeline/dbt.py | 6 +- dlt/pipeline/pipeline.py | 6 +- .../verified-sources/sql_database.md | 8 +- tests/common/configuration/test_accessors.py | 6 +- .../configuration/test_configuration.py | 123 +++++++- tests/common/configuration/test_inject.py | 250 +++++++++++++++-- .../configuration/test_toml_provider.py | 4 +- tests/common/storages/test_schema_storage.py | 14 +- tests/common/test_typing.py | 47 ++++ tests/common/test_utils.py | 77 ++++- tests/extract/test_decorators.py | 45 ++- tests/extract/test_extract.py | 4 +- tests/extract/test_incremental.py | 263 +++++++++++++++--- tests/extract/test_sources.py | 10 +- tests/helpers/streamlit_tests/__init__.py | 3 + 40 files changed, 1305 insertions(+), 474 deletions(-) diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index fed5c99fe1..037f9da3e5 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -28,7 +28,7 @@ env: RUNTIME__DLTHUB_TELEMETRY_ENDPOINT: ${{ secrets.RUNTIME__DLTHUB_TELEMETRY_ENDPOINT }} # Test redshift and filesystem with all buckets # postgres runs again here so we can test on mac/windows - ACTIVE_DESTINATIONS: "[\"redshift\", \"postgres\", \"duckdb\", \"filesystem\", \"dummy\"]" + ACTIVE_DESTINATIONS: "[\"redshift\", \"postgres\", \"duckdb\", \"filesystem\", \"dummy\", \"motherduck\"]" jobs: get_docs_changes: diff --git a/dlt/cli/config_toml_writer.py b/dlt/cli/config_toml_writer.py index 8a549a60ff..0f8984842f 100644 --- a/dlt/cli/config_toml_writer.py +++ b/dlt/cli/config_toml_writer.py @@ -4,7 +4,7 @@ from tomlkit.container import Container as TOMLContainer from collections.abc import Sequence as C_Sequence -from dlt.common.configuration.specs.base_configuration import is_hint_not_resolved +from dlt.common.configuration.specs.base_configuration import is_hint_not_resolvable from dlt.common.pendulum import pendulum from dlt.common.configuration.specs import ( BaseConfiguration, @@ -12,7 +12,7 @@ extract_inner_hint, ) from dlt.common.data_types import py_type_to_sc_type -from dlt.common.typing import AnyType, is_optional_type +from dlt.common.typing import AnyType, is_optional_type, is_subclass class WritableConfigValue(NamedTuple): @@ -35,7 +35,7 @@ def generate_typed_example(name: str, hint: AnyType) -> Any: if sc_type == "bool": return True if sc_type == "complex": - if issubclass(inner_hint, C_Sequence): + if 
is_subclass(inner_hint, C_Sequence): return ["a", "b", "c"] else: table = tomlkit.table(False) @@ -65,7 +65,7 @@ def write_value( return # do not dump nor resolvable and optional fields if they are not of special interest if ( - is_hint_not_resolved(hint) or is_optional_type(hint) or default_value is not None + is_hint_not_resolvable(hint) or is_optional_type(hint) or default_value is not None ) and not is_default_of_interest: return # get the inner hint to generate cool examples diff --git a/dlt/common/configuration/accessors.py b/dlt/common/configuration/accessors.py index dfadc97fa3..1b32ae96f4 100644 --- a/dlt/common/configuration/accessors.py +++ b/dlt/common/configuration/accessors.py @@ -1,6 +1,4 @@ import abc -import contextlib -import tomlkit from typing import Any, ClassVar, List, Sequence, Tuple, Type, TypeVar from dlt.common.configuration.container import Container @@ -9,10 +7,8 @@ from dlt.common.configuration.specs import BaseConfiguration, is_base_configuration_inner_hint from dlt.common.configuration.utils import deserialize_value, log_traces, auto_cast from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext -from dlt.common.typing import AnyType, ConfigValue, TSecretValue +from dlt.common.typing import AnyType, ConfigValue, SecretValue, TSecretValue -DLT_SECRETS_VALUE = "secrets.value" -DLT_CONFIG_VALUE = "config.value" TConfigAny = TypeVar("TConfigAny", bound=Any) @@ -129,7 +125,7 @@ def writable_provider(self) -> ConfigProvider: p for p in self._get_providers_from_context() if p.is_writable and p.supports_secrets ) - value: ClassVar[Any] = ConfigValue + value: ClassVar[Any] = SecretValue "A placeholder that tells dlt to replace it with actual secret during the call to a source or resource decorated function." 
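For readers following the sentinel change: in user code these placeholders appear as argument defaults on decorated sources and resources, and after this patch they are `ConfigValueSentinel` instances carrying the expected type rather than plain `None`. A rough sketch of the usual pattern, with an invented `github_source` and invented parameter names:

```py
import dlt


@dlt.source
def github_source(api_url: str = dlt.config.value, api_token: str = dlt.secrets.value):
    # both defaults are sentinels: dlt resolves the real values from the
    # configuration providers (secrets.toml, config.toml, environment variables)
    # at the moment the decorated function is called
    @dlt.resource
    def repos():
        yield {"api_url": api_url, "token_present": api_token is not None}

    return repos
```

Because `SecretValue` carries `TSecretValue` as its expected type, `spec_from_signature` further down in this patch can mark `api_token` as a required secret directly from the default value, which is what allows the source-code unparsing path to be removed.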
diff --git a/dlt/common/configuration/container.py b/dlt/common/configuration/container.py index 441b0e21bc..84d6194966 100644 --- a/dlt/common/configuration/container.py +++ b/dlt/common/configuration/container.py @@ -8,6 +8,7 @@ ContainerInjectableContextMangled, ContextDefaultCannotBeCreated, ) +from dlt.common.typing import is_subclass TConfiguration = TypeVar("TConfiguration", bound=ContainerInjectableContext) @@ -56,7 +57,7 @@ def __init__(self) -> None: def __getitem__(self, spec: Type[TConfiguration]) -> TConfiguration: # return existing config object or create it from spec - if not issubclass(spec, ContainerInjectableContext): + if not is_subclass(spec, ContainerInjectableContext): raise KeyError(f"{spec.__name__} is not a context") context, item = self._thread_getitem(spec) diff --git a/dlt/common/configuration/inject.py b/dlt/common/configuration/inject.py index 6699826ec8..c6ec5d4ddc 100644 --- a/dlt/common/configuration/inject.py +++ b/dlt/common/configuration/inject.py @@ -1,11 +1,10 @@ import inspect from functools import wraps -from typing import Callable, Dict, Type, Any, Optional, Tuple, TypeVar, overload, cast +from typing import Callable, Dict, Type, Any, Optional, Union, Tuple, TypeVar, overload, cast from inspect import Signature, Parameter -from contextlib import nullcontext -from dlt.common.typing import DictStrAny, StrAny, TFun, AnyFun +from dlt.common.typing import DictStrAny, TFun, AnyFun from dlt.common.configuration.resolve import resolve_configuration, inject_section from dlt.common.configuration.specs.base_configuration import BaseConfiguration from dlt.common.configuration.specs.config_section_context import ConfigSectionContext @@ -15,14 +14,16 @@ _LAST_DLT_CONFIG = "_dlt_config" _ORIGINAL_ARGS = "_dlt_orig_args" -# keep a registry of all the decorated functions -_FUNC_SPECS: Dict[int, Type[BaseConfiguration]] = {} - TConfiguration = TypeVar("TConfiguration", bound=BaseConfiguration) def get_fun_spec(f: AnyFun) -> Type[BaseConfiguration]: - return _FUNC_SPECS.get(id(f)) + return getattr(f, "__SPEC__", None) # type: ignore[no-any-return] + + +def set_fun_spec(f: AnyFun, spec: Type[BaseConfiguration]) -> None: + """Assigns a spec to a callable from which it was inferred""" + setattr(f, "__SPEC__", spec) # noqa: B010 @overload @@ -30,7 +31,7 @@ def with_config( func: TFun, /, spec: Type[BaseConfiguration] = None, - sections: Tuple[str, ...] = (), + sections: Union[str, Tuple[str, ...]] = (), sections_merge_style: ConfigSectionContext.TMergeFunc = ConfigSectionContext.prefer_incoming, auto_pipeline_section: bool = False, include_defaults: bool = True, @@ -46,7 +47,7 @@ def with_config( func: None = ..., /, spec: Type[BaseConfiguration] = None, - sections: Tuple[str, ...] = (), + sections: Union[str, Tuple[str, ...]] = (), sections_merge_style: ConfigSectionContext.TMergeFunc = ConfigSectionContext.prefer_incoming, auto_pipeline_section: bool = False, include_defaults: bool = True, @@ -61,7 +62,7 @@ def with_config( func: Optional[AnyFun] = None, /, spec: Type[BaseConfiguration] = None, - sections: Tuple[str, ...] 
= (), + sections: Union[str, Tuple[str, ...]] = (), sections_merge_style: ConfigSectionContext.TMergeFunc = ConfigSectionContext.prefer_incoming, auto_pipeline_section: bool = False, include_defaults: bool = True, @@ -88,17 +89,18 @@ def with_config( Callable[[TFun], TFun]: A decorated function """ - section_f: Callable[[StrAny], str] = None - # section may be a function from function arguments to section - if callable(sections): - section_f = sections - def decorator(f: TFun) -> TFun: SPEC: Type[BaseConfiguration] = None sig: Signature = inspect.signature(f) signature_fields: Dict[str, Any] + # find variadic kwargs to which additional arguments and injection context can be injected kwargs_arg = next( - (p for p in sig.parameters.values() if p.kind == Parameter.VAR_KEYWORD), None + ( + p + for p in sig.parameters.values() + if p.kind == Parameter.VAR_KEYWORD and p.name == "injection_kwargs" + ), + None, ) if spec is None: SPEC, signature_fields = spec_from_signature(f, sig, include_defaults, base=base) @@ -109,7 +111,7 @@ def decorator(f: TFun) -> TFun: # if no signature fields were added we will not wrap `f` for injection if len(signature_fields) == 0: # always register new function - _FUNC_SPECS[id(f)] = SPEC + set_fun_spec(f, SPEC) return f spec_arg: Parameter = None @@ -127,20 +129,23 @@ def decorator(f: TFun) -> TFun: pipeline_name_arg = p pipeline_name_arg_default = None if p.default == Parameter.empty else p.default - def resolve_config(bound_args: inspect.BoundArguments) -> BaseConfiguration: + def resolve_config( + bound_args: inspect.BoundArguments, accept_partial_: bool + ) -> BaseConfiguration: """Resolve arguments using the provided spec""" # bind parameters to signature # for calls containing resolved spec in the kwargs, we do not need to resolve again config: BaseConfiguration = None - # if section derivation function was provided then call it - if section_f: - curr_sections: Tuple[str, ...] 
= (section_f(bound_args.arguments),) - # sections may be a string - elif isinstance(sections, str): - curr_sections = (sections,) + curr_sections: Union[str, Tuple[str, ...]] = None + # section may be a function from function arguments to section + if callable(sections): + curr_sections = sections(bound_args.arguments) else: curr_sections = sections + # sections may be a string + if isinstance(curr_sections, str): + curr_sections = (curr_sections,) # if one of arguments is spec the use it as initial value if initial_config: @@ -162,11 +167,11 @@ def resolve_config(bound_args: inspect.BoundArguments) -> BaseConfiguration: # this may be called from many threads so section_context is thread affine with inject_section(section_context, lock_context=lock_context_on_injection): - # print(f"RESOLVE CONF in inject: {f.__name__}: {section_context.sections} vs {sections}") + # print(f"RESOLVE CONF in inject: {f.__name__}: {section_context.sections} vs {sections} in {bound_args.arguments}") return resolve_configuration( config or SPEC(), explicit_value=bound_args.arguments, - accept_partial=accept_partial, + accept_partial=accept_partial_, ) def update_bound_args( @@ -174,6 +179,7 @@ def update_bound_args( ) -> None: # overwrite or add resolved params resolved_params = dict(config) + # print("resolved_params", resolved_params) # overwrite or add resolved params for p in sig.parameters.values(): if p.name in resolved_params: @@ -191,11 +197,18 @@ def update_bound_args( def with_partially_resolved_config(config: Optional[BaseConfiguration] = None) -> Any: # creates a pre-resolved partial of the decorated function - empty_bound_args = sig.bind_partial() if not config: - config = resolve_config(empty_bound_args) - - def wrapped(*args: Any, **kwargs: Any) -> Any: + # TODO: this will not work if correct config is not provided + # esp. in case of parameters in _wrap being ConfigurationBase + # at least we should implement re-resolve with explicit parameters + # so we can merge partial we get here to combine a full config + empty_bound_args = sig.bind_partial() + # TODO: resolve partial here that will be updated in _wrap + config = resolve_config(empty_bound_args, accept_partial_=False) + + @wraps(f) + def _wrap(*args: Any, **kwargs: Any) -> Any: + # TODO: we should not change the outer config but deepcopy it nonlocal config # Do we need an exception here? 
@@ -213,27 +226,28 @@ def wrapped(*args: Any, **kwargs: Any) -> Any: # call the function with the pre-resolved config bound_args = sig.bind(*args, **kwargs) + # TODO: update partial config with bound_args (to cover edge cases with embedded configs) update_bound_args(bound_args, config, args, kwargs) return f(*bound_args.args, **bound_args.kwargs) - return wrapped + return _wrap @wraps(f) def _wrap(*args: Any, **kwargs: Any) -> Any: # Resolve config config: BaseConfiguration = None - bound_args = sig.bind(*args, **kwargs) + bound_args = sig.bind_partial(*args, **kwargs) if _LAST_DLT_CONFIG in kwargs: config = last_config(**kwargs) else: - config = resolve_config(bound_args) + config = resolve_config(bound_args, accept_partial_=accept_partial) # call the function with resolved config update_bound_args(bound_args, config, args, kwargs) return f(*bound_args.args, **bound_args.kwargs) # register the spec for a wrapped function - _FUNC_SPECS[id(_wrap)] = SPEC + set_fun_spec(_wrap, SPEC) # add a method to create a pre-resolved partial setattr(_wrap, "__RESOLVED_PARTIAL_FUNC__", with_partially_resolved_config) # noqa: B010 @@ -255,13 +269,14 @@ def _wrap(*args: Any, **kwargs: Any) -> Any: return decorator(func) -def last_config(**kwargs: Any) -> Any: - """Get configuration instance used to inject function arguments""" - return kwargs[_LAST_DLT_CONFIG] +def last_config(**injection_kwargs: Any) -> Any: + """Get configuration instance used to inject function kwargs""" + return injection_kwargs[_LAST_DLT_CONFIG] -def get_orig_args(**kwargs: Any) -> Tuple[Tuple[Any], DictStrAny]: - return kwargs[_ORIGINAL_ARGS] # type: ignore +def get_orig_args(**injection_kwargs: Any) -> Tuple[Tuple[Any], DictStrAny]: + """Get original argument with which the injectable function was called""" + return injection_kwargs[_ORIGINAL_ARGS] # type: ignore def create_resolved_partial(f: AnyFun, config: Optional[BaseConfiguration] = None) -> AnyFun: diff --git a/dlt/common/configuration/providers/context.py b/dlt/common/configuration/providers/context.py index c6c1aac644..de2290540c 100644 --- a/dlt/common/configuration/providers/context.py +++ b/dlt/common/configuration/providers/context.py @@ -3,6 +3,7 @@ from dlt.common.configuration.container import Container from dlt.common.configuration.specs import ContainerInjectableContext +from dlt.common.typing import is_subclass from .provider import ConfigProvider @@ -24,7 +25,7 @@ def get_value( # only context is a valid hint with contextlib.suppress(KeyError, TypeError): - if issubclass(hint, ContainerInjectableContext): + if is_subclass(hint, ContainerInjectableContext): # contexts without defaults will raise ContextDefaultCannotBeCreated return self.container[hint], hint.__name__ diff --git a/dlt/common/configuration/resolve.py b/dlt/common/configuration/resolve.py index c9644713b5..9a4373039b 100644 --- a/dlt/common/configuration/resolve.py +++ b/dlt/common/configuration/resolve.py @@ -5,10 +5,12 @@ from dlt.common.configuration.providers.provider import ConfigProvider from dlt.common.typing import ( AnyType, + ConfigValueSentinel, StrAny, TSecretValue, get_all_types_of_class_in_union, is_optional_type, + is_subclass, is_union_type, ) @@ -20,7 +22,7 @@ is_context_inner_hint, is_base_configuration_inner_hint, is_valid_hint, - is_hint_not_resolved, + is_hint_not_resolvable, ) from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.configuration.specs.exceptions import NativeValueError @@ -87,7 +89,7 @@ def 
initialize_credentials(hint: Any, initial_value: Any) -> CredentialsConfigur raise return first_credentials else: - assert issubclass(hint, CredentialsConfiguration) + assert is_subclass(hint, CredentialsConfiguration) return hint.from_init_value(initial_value) # type: ignore @@ -189,67 +191,83 @@ def _resolve_config_fields( hint = config.__hint_resolvers__[key](config) # get default and explicit values default_value = getattr(config, key, None) + explicit_none = False traces: List[LookupTrace] = [] if explicit_values: - explicit_value = explicit_values.get(key) + explicit_value = None + if key in explicit_values: + # allow None to be passed in explicit values + # so we are able to reset defaults like in regular function calls + explicit_value = explicit_values[key] + explicit_none = explicit_value is None + # detect dlt.config and dlt.secrets and force injection + if isinstance(explicit_value, ConfigValueSentinel): + explicit_value = None else: - if is_hint_not_resolved(hint): + if is_hint_not_resolvable(hint): # for final fields default value is like explicit explicit_value = default_value else: explicit_value = None - # if hint is union of configurations, any of them must be resolved - specs_in_union: List[Type[BaseConfiguration]] = [] current_value = None - if is_union_type(hint): - # if union contains a type of explicit value which is not a valid hint, return it as current value - if ( - explicit_value - and not is_valid_hint(type(explicit_value)) - and get_all_types_of_class_in_union(hint, type(explicit_value)) - ): - current_value, traces = explicit_value, [] - else: - specs_in_union = get_all_types_of_class_in_union(hint, BaseConfiguration) - if not current_value: - if len(specs_in_union) > 1: - for idx, alt_spec in enumerate(specs_in_union): - # return first resolved config from an union - try: - current_value, traces = _resolve_config_field( - key, - alt_spec, - default_value, - explicit_value, - config, - config.__section__, - explicit_sections, - embedded_sections, - accept_partial, - ) - break - except ConfigFieldMissingException as cfm_ex: - # add traces from unresolved union spec - # TODO: we should group traces per hint - currently user will see all options tried without the key info - traces.extend(list(itertools.chain(*cfm_ex.traces.values()))) - except InvalidNativeValue: - # if none of specs in union parsed - if idx == len(specs_in_union) - 1: - raise - else: - current_value, traces = _resolve_config_field( - key, - hint, - default_value, - explicit_value, - config, - config.__section__, - explicit_sections, - embedded_sections, - accept_partial, - ) + # explicit none skips resolution + if not explicit_none: + # if hint is union of configurations, any of them must be resolved + specs_in_union: List[Type[BaseConfiguration]] = [] + if is_union_type(hint): + # if union contains a type of explicit value which is not a valid hint, return it as current value + if ( + explicit_value + and not is_valid_hint(type(explicit_value)) + and get_all_types_of_class_in_union( + hint, type(explicit_value), with_superclass=True + ) + ): + current_value, traces = explicit_value, [] + else: + specs_in_union = get_all_types_of_class_in_union(hint, BaseConfiguration) + if not current_value: + if len(specs_in_union) > 1: + for idx, alt_spec in enumerate(specs_in_union): + # return first resolved config from an union + try: + current_value, traces = _resolve_config_field( + key, + alt_spec, + default_value, + explicit_value, + config, + config.__section__, + explicit_sections, + 
embedded_sections, + accept_partial, + ) + break + except ConfigFieldMissingException as cfm_ex: + # add traces from unresolved union spec + # TODO: we should group traces per hint - currently user will see all options tried without the key info + traces.extend(list(itertools.chain(*cfm_ex.traces.values()))) + except InvalidNativeValue: + # if none of specs in union parsed + if idx == len(specs_in_union) - 1: + raise + else: + current_value, traces = _resolve_config_field( + key, + hint, + default_value, + explicit_value, + config, + config.__section__, + explicit_sections, + embedded_sections, + accept_partial, + ) + else: + # set the trace for explicit none + traces = [LookupTrace("ExplicitValues", None, key, None)] # check if hint optional is_optional = is_optional_type(hint) @@ -258,7 +276,7 @@ def _resolve_config_fields( unresolved_fields[key] = traces # set resolved value in config if default_value != current_value: - if not is_hint_not_resolved(hint): + if not is_hint_not_resolvable(hint): # ignore final types setattr(config, key, current_value) @@ -302,15 +320,15 @@ def _resolve_config_field( pass # if inner_hint is BaseConfiguration then resolve it recursively elif is_base_configuration_inner_hint(inner_hint): - if isinstance(value, BaseConfiguration): + if isinstance(default_value, BaseConfiguration): + # if default value was instance of configuration, use it as embedded initial + embedded_config = default_value + default_value = None + elif isinstance(value, BaseConfiguration): # if resolved value is instance of configuration (typically returned by context provider) embedded_config = value default_value = None value = None - elif isinstance(default_value, BaseConfiguration): - # if default value was instance of configuration, use it - embedded_config = default_value - default_value = None else: embedded_config = inner_hint() diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 0456a5374a..9ef756a2a6 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -21,7 +21,7 @@ TypeVar, Literal, ) -from typing_extensions import get_args, get_origin, dataclass_transform, Annotated, TypeAlias +from typing_extensions import get_args, get_origin, dataclass_transform from functools import wraps if TYPE_CHECKING: @@ -31,11 +31,13 @@ from dlt.common.typing import ( AnyType, + ConfigValueSentinel, TAnyClass, extract_inner_type, is_annotated, is_final_type, is_optional_type, + is_subclass, is_union_type, ) from dlt.common.data_types import py_type_to_sc_type @@ -62,7 +64,7 @@ def __bool__(self) -> bool: return self.not_resolved -def is_hint_not_resolved(hint: AnyType) -> bool: +def is_hint_not_resolvable(hint: AnyType) -> bool: """Checks if hint should NOT be resolved. 
Final and types annotated like >>> Annotated[str, NotResolved()] @@ -81,15 +83,15 @@ def is_hint_not_resolved(hint: AnyType) -> bool: def is_base_configuration_inner_hint(inner_hint: Type[Any]) -> bool: - return inspect.isclass(inner_hint) and issubclass(inner_hint, BaseConfiguration) + return is_subclass(inner_hint, BaseConfiguration) def is_context_inner_hint(inner_hint: Type[Any]) -> bool: - return inspect.isclass(inner_hint) and issubclass(inner_hint, ContainerInjectableContext) + return is_subclass(inner_hint, ContainerInjectableContext) def is_credentials_inner_hint(inner_hint: Type[Any]) -> bool: - return inspect.isclass(inner_hint) and issubclass(inner_hint, CredentialsConfiguration) + return is_subclass(inner_hint, CredentialsConfiguration) def get_config_if_union_hint(hint: Type[Any]) -> Type[Any]: @@ -103,7 +105,7 @@ def is_valid_hint(hint: Type[Any]) -> bool: # class vars are skipped by dataclass return True - if is_hint_not_resolved(hint): + if is_hint_not_resolvable(hint): # all hints that are not resolved are valid return True @@ -196,7 +198,7 @@ def wrap(cls: Type[TAnyClass]) -> Type[TAnyClass]: if not hasattr(cls, ann) and not ann.startswith(("__", "_abc_")): warnings.warn( f"Missing default value for field {ann} on {cls.__name__}. None assumed. All" - " fields in configspec must have default." + " fields in configspec must have defaults." ) setattr(cls, ann, None) # get all attributes without corresponding annotations @@ -223,6 +225,20 @@ def wrap(cls: Type[TAnyClass]) -> Type[TAnyClass]: # context can have any type if not is_valid_hint(hint) and not is_context: raise ConfigFieldTypeHintNotSupported(att_name, cls, hint) + # replace config / secret sentinels + if isinstance(att_value, ConfigValueSentinel): + if is_secret_hint(att_value.default_type) and not is_secret_hint(hint): + warnings.warn( + f"You indicated {att_name} to be {att_value.default_literal} but type" + " hint is not a secret" + ) + if not is_secret_hint(att_value.default_type) and is_secret_hint(hint): + warnings.warn( + f"You typed {att_name} to be a secret but" + f" {att_value.default_literal} indicates it is not" + ) + setattr(cls, att_name, None) + if isinstance(att_value, BaseConfiguration): # Wrap config defaults in default_factory to work around dataclass # blocking mutable defaults @@ -298,7 +314,7 @@ def _get_resolvable_dataclass_fields(cls) -> Iterator[TDtcField]: """Yields all resolvable dataclass fields in the order they should be resolved""" # Sort dynamic type hint fields last because they depend on other values yield from sorted( - (f for f in cls.__dataclass_fields__.values() if cls.__is_valid_field(f)), + (f for f in cls.__dataclass_fields__.values() if is_valid_configspec_field(f)), key=lambda f: f.name in cls.__hint_resolvers__, ) @@ -356,7 +372,7 @@ def __iter__(self) -> Iterator[str]: """Iterator or valid key names""" return map( lambda field: field.name, - filter(lambda val: self.__is_valid_field(val), self.__dataclass_fields__.values()), + filter(lambda val: is_valid_configspec_field(val), self.__dataclass_fields__.values()), ) def __len__(self) -> int: @@ -372,14 +388,10 @@ def update(self, other: Any = (), /, **kwds: Any) -> None: # helper functions def __has_attr(self, __key: str) -> bool: - return __key in self.__dataclass_fields__ and self.__is_valid_field( + return __key in self.__dataclass_fields__ and is_valid_configspec_field( self.__dataclass_fields__[__key] ) - @staticmethod - def __is_valid_field(field: TDtcField) -> bool: - return not field.name.startswith("__") and 
field._field_type is dataclasses._FIELD # type: ignore - def call_method_in_mro(config, method_name: str) -> None: # python multi-inheritance is cooperative and this would require that all configurations cooperatively # call each other class_method_name. this is not at all possible as we do not know which configs in the end will @@ -397,6 +409,10 @@ def call_method_in_mro(config, method_name: str) -> None: _F_BaseConfiguration = BaseConfiguration +def is_valid_configspec_field(field: TDtcField) -> bool: + return not field.name.startswith("__") and field._field_type is dataclasses._FIELD # type: ignore + + @configspec class CredentialsConfiguration(BaseConfiguration): """Base class for all credentials. Credentials are configurations that may be stored only by providers supporting secrets.""" diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index d4cdfb729d..4919711f58 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -37,7 +37,6 @@ ) from dlt.common.configuration import configspec, resolve_configuration, known_sections, NotResolved from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration -from dlt.common.configuration.accessors import config from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.destination.exceptions import ( IdentifierTooLongException, @@ -624,7 +623,7 @@ def from_reference( return dest def client( - self, schema: Schema, initial_config: TDestinationConfig = config.value + self, schema: Schema, initial_config: TDestinationConfig = None ) -> TDestinationClient: """Returns a configured instance of the destination's job client""" return self.client_class(schema, self.configuration(initial_config)) diff --git a/dlt/common/libs/pydantic.py b/dlt/common/libs/pydantic.py index c4bf994cb9..e6af064b8f 100644 --- a/dlt/common/libs/pydantic.py +++ b/dlt/common/libs/pydantic.py @@ -28,6 +28,7 @@ extract_inner_type, is_list_generic_type, is_dict_generic_type, + is_subclass, is_union_type, ) @@ -124,7 +125,7 @@ def pydantic_to_table_schema_columns( try: data_type = py_type_to_sc_type(inner_type) except TypeError: - if issubclass(inner_type, BaseModel): + if is_subclass(inner_type, BaseModel): data_type = "complex" is_inner_type_pydantic_model = True else: @@ -250,7 +251,7 @@ def _process_annotation(t_: Type[Any]) -> Type[Any]: elif is_union_type(t_): u_t_s = tuple(_process_annotation(u_t) for u_t in extract_union_types(t_)) return Union[u_t_s] # type: ignore[return-value] - elif inspect.isclass(t_) and issubclass(t_, BaseModel): + elif is_subclass(t_, BaseModel): # types must be same before and after processing if id(t_) in _child_models: return _child_models[id(t_)] diff --git a/dlt/common/reflection/spec.py b/dlt/common/reflection/spec.py index 5c39199f63..db791c60cd 100644 --- a/dlt/common/reflection/spec.py +++ b/dlt/common/reflection/spec.py @@ -3,11 +3,9 @@ from typing import Dict, List, Tuple, Type, Any, Optional, NewType from inspect import Signature, Parameter -from dlt.common.typing import AnyType, AnyFun, TSecretValue +from dlt.common.typing import AnyType, AnyFun, ConfigValueSentinel, NoneType, TSecretValue from dlt.common.configuration import configspec, is_valid_hint, is_secret_hint from dlt.common.configuration.specs import BaseConfiguration -from dlt.common.configuration.accessors import DLT_CONFIG_VALUE, DLT_SECRETS_VALUE -from dlt.common.reflection.utils import get_func_def_node, get_literal_defaults from 
dlt.common.utils import get_callable_name # [^.^_]+ splits by . or _ @@ -34,8 +32,8 @@ def spec_from_signature( """Creates a SPEC on base `base1 for a function `f` with signature `sig`. All the arguments in `sig` that are valid SPEC hints and have defaults will be part of the SPEC. - Special markers for required SPEC fields `dlt.secrets.value` and `dlt.config.value` are parsed using - module source code, which is a hack and will not work for modules not imported from a file. + Special default markers for required SPEC fields `dlt.secrets.value` and `dlt.config.value` are sentinel + string values with a type set to Any during typechecking. The sentinels are defined in dlt.common.typing module. The name of a SPEC type is inferred from qualname of `f` and type will refer to `f` module and is unique for a module. NOTE: the SPECS are cached in the module by using name as an id. @@ -52,28 +50,6 @@ def spec_from_signature( MOD_SPEC: Type[BaseConfiguration] = getattr(module, spec_id) return MOD_SPEC, MOD_SPEC.get_resolvable_fields() - # find all the arguments that have following defaults - literal_defaults: Dict[str, str] = None - - def dlt_config_literal_to_type(arg_name: str) -> AnyType: - nonlocal literal_defaults - - if literal_defaults is None: - try: - node = get_func_def_node(f) - literal_defaults = get_literal_defaults(node) - except Exception: - # ignore exception during parsing. it is almost impossible to test all cases of function definitions - literal_defaults = {} - - if arg_name in literal_defaults: - literal_default = literal_defaults[arg_name] - if literal_default.endswith(DLT_CONFIG_VALUE): - return AnyType - if literal_default.endswith(DLT_SECRETS_VALUE): - return TSecretValue - return None - # synthesize configuration from the signature new_fields: Dict[str, Any] = {} sig_base_fields: Dict[str, Any] = {} @@ -87,40 +63,43 @@ def dlt_config_literal_to_type(arg_name: str) -> AnyType: ]: field_type = AnyType if p.annotation == Parameter.empty else p.annotation # keep the base fields if sig not annotated - if p.name in base_fields and field_type is AnyType and p.default is None: + if ( + p.name in base_fields + and field_type is AnyType + and isinstance(p.default, (NoneType, ConfigValueSentinel)) + ): sig_base_fields[p.name] = base_fields[p.name] continue # only valid hints and parameters with defaults are eligible if is_valid_hint(field_type) and p.default != Parameter.empty: - # try to get type from default - if field_type is AnyType and p.default is not None: - field_type = type(p.default) - # make type optional if explicit None is provided as default type_from_literal: AnyType = None + # make type optional if explicit None is provided as default if p.default is None: + # optional type + field_type = Optional[field_type] + elif isinstance(p.default, ConfigValueSentinel): # check if the defaults were attributes of the form .config.value or .secrets.value - type_from_literal = dlt_config_literal_to_type(p.name) - if type_from_literal is None: - # optional type - field_type = Optional[field_type] - elif type_from_literal is TSecretValue: + type_from_literal = p.default.default_type + if type_from_literal is TSecretValue: # override type with secret value if secrets.value - # print(f"Param {p.name} is REQUIRED: secrets literal") if not is_secret_hint(field_type): if field_type is AnyType: field_type = TSecretValue else: # generate typed SecretValue field_type = NewType("TSecretValue", field_type) # type: ignore - else: - # keep type mandatory if config.value - # print(f"Param {p.name} 
is REQUIRED: config literal") - pass + # remove sentinel from default + p = p.replace(default=None) + elif field_type is AnyType: + # try to get type from default + field_type = type(p.default) + if include_defaults or type_from_literal is not None: # set annotations annotations[p.name] = field_type # set field with default value new_fields[p.name] = p.default + # print(f"Param {p.name} is {field_type}: {p.default} due to {include_defaults} or {type_from_literal}") signature_fields = {**sig_base_fields, **new_fields} diff --git a/dlt/common/storages/live_schema_storage.py b/dlt/common/storages/live_schema_storage.py index fb94a21b7a..fd4ecc968e 100644 --- a/dlt/common/storages/live_schema_storage.py +++ b/dlt/common/storages/live_schema_storage.py @@ -1,16 +1,13 @@ from typing import Dict, List, cast from dlt.common.schema.schema import Schema -from dlt.common.configuration.accessors import config from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages.schema_storage import SchemaStorage from dlt.common.storages.configuration import SchemaStorageConfiguration class LiveSchemaStorage(SchemaStorage): - def __init__( - self, config: SchemaStorageConfiguration = config.value, makedirs: bool = False - ) -> None: + def __init__(self, config: SchemaStorageConfiguration, makedirs: bool = False) -> None: self.live_schemas: Dict[str, Schema] = {} super().__init__(config, makedirs) diff --git a/dlt/common/typing.py b/dlt/common/typing.py index 2d46f367d8..29c1b01d80 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -1,8 +1,9 @@ -from collections.abc import Mapping as C_Mapping, Sequence as C_Sequence +from collections.abc import Mapping as C_Mapping, Sequence as C_Sequence, Callable as C_Callable from datetime import datetime, date # noqa: I251 import inspect import os from re import Pattern as _REPattern +import sys from types import FunctionType, MethodType, ModuleType from typing import ( ForwardRef, @@ -27,6 +28,7 @@ IO, Iterator, Generator, + NamedTuple, ) from typing_extensions import ( @@ -49,6 +51,16 @@ # in versions of Python>=3.10. UnionType = Never +if sys.version_info[:3] >= (3, 9, 0): + from typing import _SpecialGenericAlias, _GenericAlias # type: ignore[attr-defined] + from types import GenericAlias # type: ignore[attr-defined] + + typingGenericAlias: Tuple[Any, ...] 
= (_GenericAlias, _SpecialGenericAlias, GenericAlias) +else: + from typing import _GenericAlias # type: ignore[attr-defined] + + typingGenericAlias = (_GenericAlias,) + from dlt.common.pendulum import timedelta, pendulum if TYPE_CHECKING: @@ -88,10 +100,6 @@ "A single data item or a list as extracted from the data source" TAnyDateTime = Union[pendulum.DateTime, pendulum.Date, datetime, date, str, float, int] """DateTime represented as pendulum/python object, ISO string or unix timestamp""" - -ConfigValue: None = None -"""value of type None indicating argument that may be injected by config provider""" - TVariantBase = TypeVar("TVariantBase", covariant=True) TVariantRV = Tuple[str, Any] VARIANT_FIELD_FORMAT = "v_%s" @@ -99,6 +107,30 @@ TSortOrder = Literal["asc", "desc"] +class ConfigValueSentinel(NamedTuple): + """Class to create singleton sentinel for config and secret injected value""" + + default_literal: str + default_type: AnyType + + def __str__(self) -> str: + return self.__repr__() + + def __repr__(self) -> str: + if self.default_literal == "dlt.config.value": + inst_ = "ConfigValue" + else: + inst_ = "SecretValue" + return f"{inst_}({self.default_literal}) awaiting injection" + + +ConfigValue: None = ConfigValueSentinel("dlt.config.value", AnyType) # type: ignore[assignment] +"""Config value indicating argument that may be injected by config provider. Evaluates to None when type checking""" + +SecretValue: None = ConfigValueSentinel("dlt.secrets.value", TSecretValue) # type: ignore[assignment] +"""Secret value indicating argument that may be injected by config provider. Evaluates to None when type checking""" + + @runtime_checkable class SupportsVariant(Protocol, Generic[TVariantBase]): """Defines variant type protocol that should be recognized by normalizers @@ -157,6 +189,10 @@ def extract_type_if_modifier(t: Type[Any]) -> Optional[Type[Any]]: return None +def extract_supertype(t: Type[Any]) -> Optional[Type[Any]]: + return getattr(t, "__supertype__", None) # type: ignore[no-any-return] + + def is_union_type(hint: Type[Any]) -> bool: # We need to handle UnionType because with Python>=3.10 # new Optional syntax was introduced which treats Optionals @@ -172,8 +208,8 @@ def is_union_type(hint: Type[Any]) -> bool: if origin is Union or origin is UnionType: return True - if hint := extract_type_if_modifier(hint): - return is_union_type(hint) + if inner_t := extract_type_if_modifier(hint): + return is_union_type(inner_t) return False @@ -184,8 +220,13 @@ def is_optional_type(t: Type[Any]) -> bool: if is_union and type(None) in get_args(t): return True - if t := extract_type_if_modifier(t): - return is_optional_type(t) + if inner_t := extract_type_if_modifier(t): + if is_optional_type(inner_t): + return True + else: + t = inner_t + if super_t := extract_supertype(t): + return is_optional_type(super_t) return False @@ -203,24 +244,37 @@ def extract_union_types(t: Type[Any], no_none: bool = False) -> List[Any]: def is_literal_type(hint: Type[Any]) -> bool: if get_origin(hint) is Literal: return True - if hint := extract_type_if_modifier(hint): - return is_literal_type(hint) + if inner_t := extract_type_if_modifier(hint): + if is_literal_type(inner_t): + return True + else: + hint = inner_t + if super_t := extract_supertype(hint): + return is_literal_type(super_t) + if is_union_type(hint) and is_optional_type(hint): + return is_literal_type(get_args(hint)[0]) + return False def is_newtype_type(t: Type[Any]) -> bool: if hasattr(t, "__supertype__"): return True - if t := 
extract_type_if_modifier(t): - return is_newtype_type(t) + if inner_t := extract_type_if_modifier(t): + if is_newtype_type(inner_t): + return True + else: + t = inner_t + if is_union_type(t) and is_optional_type(t): + return is_newtype_type(get_args(t)[0]) return False def is_typeddict(t: Type[Any]) -> bool: if isinstance(t, _TypedDict): return True - if t := extract_type_if_modifier(t): - return is_typeddict(t) + if inner_t := extract_type_if_modifier(t): + return is_typeddict(inner_t) return False @@ -259,7 +313,8 @@ def extract_inner_type( """ if maybe_modified := extract_type_if_modifier(hint): return extract_inner_type(maybe_modified, preserve_new_types, preserve_literal) - if is_optional_type(hint): + # make sure we deal with optional directly + if is_union_type(hint) and is_optional_type(hint): return extract_inner_type(get_args(hint)[0], preserve_new_types, preserve_literal) if is_literal_type(hint) and not preserve_literal: # assume that all literals are of the same type @@ -270,15 +325,42 @@ def extract_inner_type( return hint -def get_all_types_of_class_in_union(hint: Type[Any], cls: Type[TAny]) -> List[Type[TAny]]: - # hint is an Union that contains classes, return all classes that are a subclass or superclass of cls +def get_all_types_of_class_in_union( + hint: Any, cls: TAny, with_superclass: bool = False +) -> List[TAny]: + """if `hint` is an Union that contains classes, return all classes that are a subclass or (optionally) superclass of cls""" return [ t for t in get_args(hint) - if not is_typeddict(t) and inspect.isclass(t) and (issubclass(t, cls) or issubclass(cls, t)) + if not is_typeddict(t) and (is_subclass(t, cls) or is_subclass(cls, t) and with_superclass) ] +def is_generic_alias(tp: Any) -> bool: + """Tests if type is a generic alias ie. List[str]""" + return isinstance(tp, typingGenericAlias) and tp.__origin__ not in ( + Union, + tuple, + ClassVar, + C_Callable, + ) + + +def is_subclass(subclass: Any, cls: Any) -> bool: + """Return whether 'cls' is a derived from another class or is the same class. + + Will handle generic types by comparing their origins. + """ + if is_generic_alias(subclass): + subclass = get_origin(subclass) + if is_generic_alias(cls): + cls = get_origin(cls) + + if inspect.isclass(subclass) and inspect.isclass(cls): + return issubclass(subclass, cls) + return False + + def get_generic_type_argument_from_instance( instance: Any, sample_value: Optional[Any] ) -> Type[Any]: diff --git a/dlt/common/utils.py b/dlt/common/utils.py index 1d3020f4dd..cb2ec4c3d9 100644 --- a/dlt/common/utils.py +++ b/dlt/common/utils.py @@ -15,6 +15,7 @@ Any, ContextManager, Dict, + MutableMapping, Iterator, Optional, Sequence, @@ -24,17 +25,15 @@ Mapping, List, Union, - Counter, Iterable, ) -from collections.abc import Mapping as C_Mapping from dlt.common.exceptions import DltException, ExceptionTrace, TerminalException from dlt.common.typing import AnyFun, StrAny, DictStrAny, StrStr, TAny, TFun T = TypeVar("T") -TDict = TypeVar("TDict", bound=DictStrAny) +TDict = TypeVar("TDict", bound=MutableMapping[Any, Any]) TKey = TypeVar("TKey") TValue = TypeVar("TValue") @@ -281,35 +280,36 @@ def update_dict_with_prune(dest: DictStrAny, update: StrAny) -> None: del dest[k] -def update_dict_nested(dst: TDict, src: StrAny, keep_dst_values: bool = False) -> TDict: +def update_dict_nested(dst: TDict, src: TDict, copy_src_dicts: bool = False) -> TDict: """Merges `src` into `dst` key wise. Does not recur into lists. Values in `src` overwrite `dst` if both keys exit. 
- Optionally (`keep_dst_values`) you can keep the `dst` value on conflict + Only `dict` and its subclasses are updated recursively. With `copy_src_dicts`, dict key:values will be deep copied, + otherwise, both dst and src will keep the same references. """ - # based on https://github.com/clarketm/mergedeep/blob/master/mergedeep/mergedeep.py - - def _is_recursive_merge(a: StrAny, b: StrAny) -> bool: - both_mapping = isinstance(a, C_Mapping) and isinstance(b, C_Mapping) - both_counter = isinstance(a, Counter) and isinstance(b, Counter) - return both_mapping and not both_counter for key in src: + src_val = src[key] if key in dst: - if _is_recursive_merge(dst[key], src[key]): + dst_val = dst[key] + if isinstance(src_val, dict) and isinstance(dst_val, dict): # If the key for both `dst` and `src` are both Mapping types (e.g. dict), then recurse. - update_dict_nested(dst[key], src[key], keep_dst_values=keep_dst_values) - elif dst[key] is src[key]: - # If a key exists in both objects and the values are `same`, the value from the `dst` object will be used. - pass - else: - if not keep_dst_values: - # if not keep then overwrite - dst[key] = src[key] + update_dict_nested(dst_val, src_val, copy_src_dicts=copy_src_dicts) + continue + + if copy_src_dicts and isinstance(src_val, dict): + dst[key] = update_dict_nested({}, src_val, True) else: - # If the key exists only in `src`, the value from the `src` object will be used. - dst[key] = src[key] + dst[key] = src_val + return dst +def clone_dict_nested(src: TDict) -> TDict: + """Clones `src` structure descending into nested dicts. Does not descend into mappings that are not dicts ie. specs instances. + Compared to `deepcopy` does not clone any other objects. Uses `update_dict_nested` internally + """ + return update_dict_nested({}, src, copy_src_dicts=True) # type: ignore[return-value] + + def map_nested_in_place(func: AnyFun, _complex: TAny) -> TAny: """Applies `func` to all elements in `_dict` recursively, replacing elements in nested dictionaries and lists in place.""" if isinstance(_complex, tuple): diff --git a/dlt/destinations/impl/destination/configuration.py b/dlt/destinations/impl/destination/configuration.py index bad7e4e3cc..c3b677058c 100644 --- a/dlt/destinations/impl/destination/configuration.py +++ b/dlt/destinations/impl/destination/configuration.py @@ -22,4 +22,4 @@ class CustomDestinationClientConfiguration(DestinationClientConfiguration): loader_file_format: TLoaderFileFormat = "typed-jsonl" batch_size: int = 10 skip_dlt_columns_and_tables: bool = True - max_table_nesting: int = 0 + max_table_nesting: Optional[int] = 0 diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 9c4076cfa7..a7246b6832 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -23,11 +23,13 @@ from dlt.common.configuration import with_config, get_fun_spec, known_sections, configspec from dlt.common.configuration.container import Container from dlt.common.configuration.exceptions import ContextDefaultCannotBeCreated +from dlt.common.configuration.inject import set_fun_spec from dlt.common.configuration.resolve import inject_section from dlt.common.configuration.specs import BaseConfiguration, ContainerInjectableContext from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.exceptions import ArgumentsOverloadException from dlt.common.pipeline import PipelineContext +from dlt.common.reflection.spec import spec_from_signature from dlt.common.schema.utils import 
DEFAULT_WRITE_DISPOSITION from dlt.common.source import _SOURCES, SourceInfo from dlt.common.schema.schema import Schema @@ -453,9 +455,7 @@ def resource( TDltResourceImpl instance which may be loaded, iterated or combined with other resources into a pipeline. """ - def make_resource( - _name: str, _section: str, _data: Any, incremental: IncrementalResourceWrapper = None - ) -> TDltResourceImpl: + def make_resource(_name: str, _section: str, _data: Any) -> TDltResourceImpl: table_template = make_hints( table_name, write_disposition=write_disposition or DEFAULT_WRITE_DISPOSITION, @@ -466,14 +466,6 @@ def make_resource( table_format=table_format, ) - # If custom nesting level was specified then - # we need to add it to table hints so that - # later in normalizer dlt/common/normalizers/json/relational.py - # we can override max_nesting level for the given table - if max_table_nesting is not None: - table_template.setdefault("x-normalizer", {}) # type: ignore[typeddict-item] - table_template["x-normalizer"]["max_nesting"] = max_table_nesting # type: ignore[typeddict-item] - resource = _impl_cls.from_data( _data, _name, @@ -481,8 +473,14 @@ def make_resource( table_template, selected, cast(DltResource, data_from), - incremental=incremental, + True, ) + # If custom nesting level was specified then + # we need to add it to table hints so that + # later in normalizer dlt/common/normalizers/json/relational.py + # we can override max_nesting level for the given table + if max_table_nesting is not None: + resource.max_table_nesting = max_table_nesting if parallelized: return resource.parallelize() return resource @@ -502,82 +500,52 @@ def decorator( resource_name = name if name and not callable(name) else get_callable_name(f) - # do not inject config values for inner functions, we assume that they are part of the source - SPEC: Type[BaseConfiguration] = None - # wrap source extraction function in configuration with section func_module = inspect.getmodule(f) source_section = _get_source_section_name(func_module) - - incremental: IncrementalResourceWrapper = None - sig = inspect.signature(f) - if IncrementalResourceWrapper.should_wrap(sig): - incremental = IncrementalResourceWrapper(primary_key) - incr_f = incremental.wrap(sig, f) if incremental else f - - resource_sections = (known_sections.SOURCES, source_section, resource_name) - - # standalone resource will prefer existing section context when resolving config values - # this lets the source to override those values and provide common section for all config values for resources present in that source - # for autogenerated spec do not include defaults - # NOTE: allow full config for standalone, currently some edge cases for incremental does not work - # (removing it via apply hints or explicit call) - conf_f = with_config( - incr_f, - spec=spec, - sections=resource_sections, - sections_merge_style=ConfigSectionContext.resource_merge_style, - include_defaults=spec is not None, # or standalone, - ) is_inner_resource = is_inner_callable(f) - if conf_f != incr_f and is_inner_resource and not standalone: - raise ResourceInnerCallableConfigWrapDisallowed(resource_name, source_section) - # get spec for wrapped function - SPEC = get_fun_spec(conf_f) - # store the standalone resource information + if spec is None: + # autodetect spec + SPEC, resolvable_fields = spec_from_signature( + f, inspect.signature(f), include_defaults=standalone + ) + print(SPEC, resolvable_fields, standalone) + if is_inner_resource and not standalone: + if len(resolvable_fields) > 0: 
+ # prevent required arguments to inner functions that are not standalone + raise ResourceInnerCallableConfigWrapDisallowed(resource_name, source_section) + else: + # empty spec for inner functions - they should not be injected + SPEC = BaseConfiguration + else: + SPEC = spec + # assign spec to "f" + set_fun_spec(f, SPEC) + + # store the non-inner resource information if not is_inner_resource: _SOURCES[f.__qualname__] = SourceInfo(SPEC, f, func_module) if not standalone: # we return a DltResource that is callable and returns dlt resource when called # so it should match the signature - return make_resource(resource_name, source_section, conf_f, incremental) # type: ignore[return-value] - - # wrap the standalone resource - if data_from: - compat_wrapper, skip_args = wrap_compat_transformer, 1 - else: - compat_wrapper, skip_args = wrap_resource_gen, 0 + return make_resource(resource_name, source_section, f) # type: ignore[return-value] - @wraps(incr_f) + @wraps(f) def _wrap(*args: Any, **kwargs: Any) -> TDltResourceImpl: - _, mod_sig, bound_args = simulate_func_call(incr_f, skip_args, *args, **kwargs) + skip_args = 1 if data_from else 0 + _, mod_sig, bound_args = simulate_func_call(f, skip_args, *args, **kwargs) actual_resource_name = name(bound_args.arguments) if callable(name) else resource_name - # wrap again with an actual resource name - conf_f = with_config( - incr_f, - spec=SPEC, - sections=resource_sections[:-1] + (actual_resource_name,), - sections_merge_style=ConfigSectionContext.resource_merge_style, - ) - try: - r = make_resource( - actual_resource_name, - source_section, - compat_wrapper(actual_resource_name, conf_f, sig, *args, **kwargs), - incremental, - ) - except InvalidResourceDataTypeFunctionNotAGenerator: + r = make_resource(actual_resource_name, source_section, f) + # wrap the standalone resource + data_ = r._pipe.bind_gen(*args, **kwargs) + if isinstance(data_, DltResource): # we allow an edge case: resource can return another resource - # actually call the function to see if it contains DltResource - data_ = conf_f(*args, **kwargs) - if not isinstance(data_, DltResource): - raise r = data_ # type: ignore[assignment] # consider transformer arguments bound r._args_bound = True # keep explicit args passed - r._set_explicit_args(conf_f, mod_sig, *args, **kwargs) + r._set_explicit_args(f, mod_sig, *args, **kwargs) return r return _wrap diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 009cd8cc53..f8966c3ced 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -219,7 +219,7 @@ def _compute_metrics(self, load_id: str, source: DltSource) -> ExtractMetrics: if name == "incremental": # represent incremental as dictionary (it derives from BaseConfiguration) if isinstance(hint, IncrementalResourceWrapper): - hint = hint._incremental + hint = hint.incremental # sometimes internal incremental is not bound if hint: hints[name] = dict(hint) # type: ignore[call-overload] @@ -297,9 +297,8 @@ def _extract_single_source( load_id: str, source: DltSource, *, - max_parallel_items: int = None, - workers: int = None, - futures_poll_interval: float = None, + max_parallel_items: int, + workers: int, ) -> None: schema = source.schema collector = self.collector @@ -319,7 +318,6 @@ def _extract_single_source( source.resources.selected_pipes, max_parallel_items=max_parallel_items, workers=workers, - futures_poll_interval=futures_poll_interval, ) as pipes: left_gens = total_gens = len(pipes._sources) collector.update("Resources", 0, total_gens) diff --git 
a/dlt/extract/hints.py b/dlt/extract/hints.py index 70ff0cc29d..287474c82c 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -24,6 +24,7 @@ new_table, ) from dlt.common.typing import TDataItem +from dlt.common.utils import clone_dict_nested from dlt.common.validation import validate_dict_ignoring_xkeys from dlt.extract.exceptions import ( DataItemRequiredForDynamicTableHints, @@ -318,7 +319,7 @@ def apply_hints( # set properties that can't be passed to make_hints if incremental is not None: - t["incremental"] = None if incremental is Incremental.EMPTY else incremental + t["incremental"] = incremental self._set_hints(t, create_table_variant) @@ -375,11 +376,11 @@ def merge_hints( @staticmethod def _clone_hints(hints_template: TResourceHints) -> TResourceHints: - t_ = copy(hints_template) - t_["columns"] = deepcopy(hints_template["columns"]) - if "schema_contract" in hints_template: - t_["schema_contract"] = deepcopy(hints_template["schema_contract"]) - return t_ + if hints_template is None: + return None + # creates a deep copy of dict structure without actually copying the objects + # deepcopy(hints_template) # + return clone_dict_nested(hints_template) # type: ignore[type-var] @staticmethod def _resolve_hint(item: TDataItem, hint: TTableHintTemplate[Any]) -> Any: diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index b6ecd2d3db..bcb6b1cc9a 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -6,8 +6,6 @@ import inspect from functools import wraps - -import dlt from dlt.common import logger from dlt.common.exceptions import MissingDependencyException from dlt.common.pendulum import pendulum @@ -20,6 +18,7 @@ extract_inner_type, get_generic_type_argument_from_instance, is_optional_type, + is_subclass, ) from dlt.common.schema.typing import TColumnNames from dlt.common.configuration import configspec, ConfigurationValueError @@ -111,7 +110,7 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa def __init__( self, - cursor_path: str = dlt.config.value, + cursor_path: str = None, initial_value: Optional[TCursorValue] = None, last_value_func: Optional[LastValueFunc[TCursorValue]] = max, primary_key: Optional[TTableHintTemplate[TColumnNames]] = None, @@ -254,14 +253,22 @@ def on_resolved(self) -> None: def parse_native_representation(self, native_value: Any) -> None: if isinstance(native_value, Incremental): - self.cursor_path = native_value.cursor_path - self.initial_value = native_value.initial_value - self.last_value_func = native_value.last_value_func - self.end_value = native_value.end_value - self.resource_name = native_value.resource_name - self._primary_key = native_value._primary_key - self.allow_external_schedulers = native_value.allow_external_schedulers - self.row_order = native_value.row_order + if self is self.EMPTY: + raise ValueError("Trying to resolve EMPTY Incremental") + if native_value is self.EMPTY: + raise ValueError( + "Do not use EMPTY Incremental as default or explicit values. Pass None to reset" + " an incremental." 
+ ) + merged = self.merge(native_value) + self.cursor_path = merged.cursor_path + self.initial_value = merged.initial_value + self.last_value_func = merged.last_value_func + self.end_value = merged.end_value + self.resource_name = merged.resource_name + self._primary_key = merged._primary_key + self.allow_external_schedulers = merged.allow_external_schedulers + self.row_order = merged.row_order else: # TODO: Maybe check if callable(getattr(native_value, '__lt__', None)) # Passing bare value `incremental=44` gets parsed as initial_value self.initial_value = native_value @@ -440,7 +447,7 @@ def can_close(self) -> bool: def __str__(self) -> str: return ( - f"Incremental at {id(self)} for resource {self.resource_name} with cursor path:" + f"Incremental at 0x{id(self):x} for resource {self.resource_name} with cursor path:" f" {self.cursor_path} initial {self.initial_value} - {self.end_value} lv_func" f" {self.last_value_func}" ) @@ -490,6 +497,8 @@ def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]: class IncrementalResourceWrapper(ItemTransform[TDataItem]): _incremental: Optional[Incremental[Any]] = None """Keeps the injectable incremental""" + _from_hints: bool = False + """If True, incremental was set explicitly from_hints""" _resource_name: str = None def __init__(self, primary_key: Optional[TTableHintTemplate[TColumnNames]] = None) -> None: @@ -516,10 +525,7 @@ def get_incremental_arg(sig: inspect.Signature) -> Optional[inspect.Parameter]: incremental_param: Optional[inspect.Parameter] = None for p in sig.parameters.values(): annotation = extract_inner_type(p.annotation) - annotation = get_origin(annotation) or annotation - if (inspect.isclass(annotation) and issubclass(annotation, Incremental)) or isinstance( - p.default, Incremental - ): + if is_subclass(annotation, Incremental) or isinstance(p.default, Incremental): incremental_param = p break return incremental_param @@ -539,8 +545,10 @@ def _wrap(*args: Any, **kwargs: Any) -> Any: if p.name in bound_args.arguments: explicit_value = bound_args.arguments[p.name] if explicit_value is Incremental.EMPTY or p.default is Incremental.EMPTY: - # drop incremental - pass + raise ValueError( + "Do not use EMPTY Incremental as default or explicit values. Pass None to" + " reset an incremental." + ) elif isinstance(explicit_value, Incremental): # Explicit Incremental instance is merged with default # allowing e.g. 
to only update initial_value/end_value but keeping default cursor_path @@ -573,14 +581,9 @@ def _wrap(*args: Any, **kwargs: Any) -> Any: new_incremental.__orig_class__ = p.annotation # type: ignore # set the incremental only if not yet set or if it was passed explicitly - # NOTE: if new incremental is resolved, it was passed via config injection # NOTE: the _incremental may be also set by applying hints to the resource see `set_template` in `DltResource` - if ( - new_incremental - and p.name in bound_args.arguments - and not new_incremental.is_resolved() - ) or not self._incremental: - self._incremental = new_incremental + if (new_incremental and p.name in bound_args.arguments) or not self._incremental: + self.set_incremental(new_incremental) if not self._incremental.is_resolved(): self._incremental.resolve() # in case of transformers the bind will be called before this wrapper is set: because transformer is called for a first time late in the pipe @@ -593,6 +596,20 @@ def _wrap(*args: Any, **kwargs: Any) -> Any: return _wrap # type: ignore + @property + def incremental(self) -> Optional[Incremental[Any]]: + return self._incremental + + def set_incremental( + self, incremental: Optional[Incremental[Any]], from_hints: bool = False + ) -> None: + """Sets the incremental. If incremental was set from_hints, it can only be changed in the same manner""" + if self._from_hints and not from_hints: + # do not accept incremental if apply hints were used + return + self._from_hints = from_hints + self._incremental = incremental + @property def allow_external_schedulers(self) -> bool: """Allows the Incremental instance to get its initial and end values from external schedulers like Airflow""" diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index a64d5070b8..eecb570375 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -14,6 +14,7 @@ ) from typing_extensions import TypeVar, Self +from dlt.common.configuration.inject import get_fun_spec, with_config from dlt.common.configuration.resolve import inject_section from dlt.common.configuration.specs import known_sections from dlt.common.configuration.specs.config_section_context import ConfigSectionContext @@ -94,7 +95,6 @@ def __init__( pipe: Pipe, hints: TResourceHints, selected: bool, - incremental: IncrementalResourceWrapper = None, section: str = None, args_bound: bool = False, ) -> None: @@ -103,8 +103,6 @@ def __init__( self._pipe = pipe self._args_bound = args_bound self._explicit_args: DictStrAny = None - if incremental and not self.incremental: - self.add_step(incremental) self.source_name = None super().__init__(hints) @@ -117,8 +115,16 @@ def from_data( hints: TResourceHints = None, selected: bool = True, data_from: Union["DltResource", Pipe] = None, - incremental: IncrementalResourceWrapper = None, + inject_config: bool = False, ) -> Self: + """Creates an instance of DltResource from compatible `data` with a given `name` and `section`. + + Internally (in the most common case) a new instance of Pipe with `name` is created from `data` and + optionally connected to an existing pipe `from_data` to form a transformer (dependent resource). + + If `inject_config` is set to True and data is a callable, the callable is wrapped in incremental and config + injection wrappers. 
+ """ if data is None: raise InvalidResourceDataTypeIsNone(name, data, NoneType) @@ -126,7 +132,10 @@ def from_data( return data # type: ignore[return-value] if isinstance(data, Pipe): - return cls(data, hints, selected, incremental=incremental, section=section) + r_ = cls(data, hints, selected, section=section) + if inject_config: + r_._inject_config() + return r_ if callable(data): name = name or get_callable_name(data) @@ -155,14 +164,16 @@ def from_data( # create resource from iterator, iterable or generator function if isinstance(data, (Iterable, Iterator, AsyncIterable)) or callable(data): pipe = Pipe.from_data(name, data, parent=parent_pipe) - return cls( + r_ = cls( pipe, hints, selected, - incremental=incremental, section=section, args_bound=not callable(data), ) + if inject_config: + r_._inject_config() + return r_ else: # some other data type that is not supported raise InvalidResourceDataType( @@ -226,9 +237,12 @@ def max_table_nesting(self) -> Optional[int]: return max_nesting if isinstance(max_nesting, int) else None @max_table_nesting.setter - def max_table_nesting(self, value: int) -> None: - self._hints.setdefault("x-normalizer", {}) # type: ignore[typeddict-item] - self._hints["x-normalizer"]["max_nesting"] = value # type: ignore[typeddict-item] + def max_table_nesting(self, value: Optional[int]) -> None: + normalizer = self._hints.setdefault("x-normalizer", {}) # type: ignore[typeddict-item] + if value is None: + normalizer.pop("max_nesting", None) + else: + normalizer["max_nesting"] = value def pipe_data_from(self: TDltResourceImpl, data_from: Union[TDltResourceImpl, Pipe]) -> None: """Replaces the parent in the transformer resource pipe from which the data is piped.""" @@ -420,12 +434,26 @@ def _set_hints( incremental = self.incremental # try to late assign incremental if table_schema_template.get("incremental") is not None: - if incremental: - incremental._incremental = table_schema_template["incremental"] - else: + new_incremental = table_schema_template["incremental"] + # remove incremental if empty + if new_incremental is Incremental.EMPTY: + new_incremental = None + + if incremental is not None: + if isinstance(incremental, IncrementalResourceWrapper): + # replace in wrapper + incremental.set_incremental(new_incremental, from_hints=True) + else: + step_no = self._pipe.find(Incremental) + self._pipe.remove_step(step_no) + # re-add the step + incremental = None + + if incremental is None: # if there's no wrapper add incremental as a transform - incremental = table_schema_template["incremental"] # type: ignore - self.add_step(incremental) + incremental = new_incremental # type: ignore + if new_incremental: + self.add_step(new_incremental) if incremental: primary_key = table_schema_template.get("primary_key", incremental.primary_key) @@ -461,6 +489,14 @@ def bind(self: TDltResourceImpl, *args: Any, **kwargs: Any) -> TDltResourceImpl: self._set_explicit_args(orig_gen, None, *args, **kwargs) # type: ignore return self + @property + def args_bound(self) -> bool: + """Returns true if resource the parameters are bound to values. Such resource cannot be further called. + Note that resources are lazily evaluated and arguments are only formally checked. Configuration + was not yet injected as well. + """ + return self._args_bound + @property def explicit_args(self) -> StrAny: """Returns a dictionary of arguments used to parametrize the resource. 
Does not include defaults and injected args.""" @@ -535,20 +571,80 @@ def _set_explicit_args( except Exception: pass + def _eject_config(self) -> bool: + """Unwraps the pipe generator step from config injection and incremental wrappers by restoring the original step. + + Removes the step with incremental wrapper. Should be used before a subsequent _inject_config is called on the + same pipe to successfully wrap it with new incremental and config injection. + Note that resources with bound arguments cannot be ejected. + + """ + if not self._pipe.is_empty and not self._args_bound: + orig_gen = getattr(self._pipe.gen, "__GEN__", None) + if orig_gen: + step_no = self._pipe.find(IncrementalResourceWrapper) + if step_no >= 0: + self._pipe.remove_step(step_no) + self._pipe.replace_gen(orig_gen) + return True + return False + + def _inject_config(self) -> "DltResource": + """Wraps the pipe generation step in incremental and config injection wrappers and adds pipe step with + Incremental transform. + """ + gen = self._pipe.gen + if not callable(gen): + return self + + incremental: IncrementalResourceWrapper = None + sig = inspect.signature(gen) + if IncrementalResourceWrapper.should_wrap(sig): + incremental = IncrementalResourceWrapper(self._hints.get("primary_key")) + incr_f = incremental.wrap(sig, gen) + self.add_step(incremental) + else: + incr_f = gen + resource_sections = (known_sections.SOURCES, self.section, self.name) + # function should have associated SPEC + spec = get_fun_spec(gen) + # standalone resource will prefer existing section context when resolving config values + # this lets the source to override those values and provide common section for all config values for resources present in that source + # for autogenerated spec do not include defaults + conf_f = with_config( + incr_f, + spec=spec, + sections=resource_sections, + sections_merge_style=ConfigSectionContext.resource_merge_style, + ) + if conf_f != gen: + self._pipe.replace_gen(conf_f) + # storage the original generator to be able to eject config and incremental wrapper + # when resource is cloned + setattr(conf_f, "__GEN__", gen) # noqa: B010 + return self + def _clone( self: TDltResourceImpl, new_name: str = None, with_parent: bool = False ) -> TDltResourceImpl: - """Creates a deep copy of a current resource, optionally renaming the resource. The clone will not be part of the source""" + """Creates a deep copy of a current resource, optionally renaming the resource. 
The clone will not be part of the source.""" pipe = self._pipe if self._pipe and not self._pipe.is_empty: pipe = pipe._clone(new_name=new_name, with_parent=with_parent) # incremental and parent are already in the pipe (if any) - return self.__class__( + r_ = self.__class__( pipe, - deepcopy(self._hints), + self._clone_hints(self._hints), selected=self.selected, section=self.section, + args_bound=self._args_bound, ) + # try to eject and then inject configuration and incremental wrapper when resource is cloned + # this makes sure that a take config values from a right section and wrapper has a separated + # instance in the pipeline + if r_._eject_config(): + r_._inject_config() + return r_ def _get_config_section_context(self) -> ConfigSectionContext: container = Container() diff --git a/dlt/extract/source.py b/dlt/extract/source.py index 6e78f3c5ba..658f884c40 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -178,6 +178,7 @@ class DltSource(Iterable[TDataItem]): * You can select and deselect resources that you want to load via `with_resources` method * You can access the resources (which are `DltResource` instances) as source attributes * It implements `Iterable` interface so you can get all the data from the resources yourself and without dlt pipeline present. + * It will create a DAG from resources and transformers and optimize the extraction so parent resources are extracted only once * You can get the `schema` for the source and all the resources within it. * You can use a `run` method to load the data with a default instance of dlt pipeline. * You can get source read only state for the currently active Pipeline instance diff --git a/dlt/helpers/dbt/__init__.py b/dlt/helpers/dbt/__init__.py index 4801dcd6b9..08d6c23ed1 100644 --- a/dlt/helpers/dbt/__init__.py +++ b/dlt/helpers/dbt/__init__.py @@ -6,7 +6,7 @@ from dlt.common.runners import Venv from dlt.common.destination.reference import DestinationClientDwhConfiguration from dlt.common.configuration.specs import CredentialsWithDefault -from dlt.common.typing import TSecretValue +from dlt.common.typing import TSecretValue, ConfigValue from dlt.version import get_installed_requirement_string from dlt.helpers.dbt.runner import create_runner, DBTPackageRunner @@ -84,9 +84,9 @@ def package_runner( destination_configuration: DestinationClientDwhConfiguration, working_dir: str, package_location: str, - package_repository_branch: str = None, + package_repository_branch: str = ConfigValue, package_repository_ssh_key: TSecretValue = TSecretValue(""), # noqa - auto_full_refresh_when_out_of_sync: bool = None, + auto_full_refresh_when_out_of_sync: bool = ConfigValue, ) -> DBTPackageRunner: default_profile_name = _default_profile_name(destination_configuration) return create_runner( diff --git a/dlt/helpers/dbt/configuration.py b/dlt/helpers/dbt/configuration.py index 70fa4d1ac5..bec0bace3c 100644 --- a/dlt/helpers/dbt/configuration.py +++ b/dlt/helpers/dbt/configuration.py @@ -10,7 +10,7 @@ class DBTRunnerConfiguration(BaseConfiguration): package_location: str = None package_repository_branch: Optional[str] = None - package_repository_ssh_key: TSecretValue = TSecretValue( + package_repository_ssh_key: Optional[TSecretValue] = TSecretValue( "" ) # the default is empty value which will disable custom SSH KEY package_profiles_dir: Optional[str] = None diff --git a/dlt/helpers/dbt/runner.py b/dlt/helpers/dbt/runner.py index 7b1f79dc77..c68931d7db 100644 --- a/dlt/helpers/dbt/runner.py +++ b/dlt/helpers/dbt/runner.py @@ -1,7 +1,7 @@ 
import os from subprocess import CalledProcessError import giturlparse -from typing import Sequence +from typing import Optional, Sequence import dlt from dlt.common import logger @@ -302,11 +302,11 @@ def create_runner( credentials: DestinationClientDwhConfiguration, working_dir: str, package_location: str = dlt.config.value, - package_repository_branch: str = None, - package_repository_ssh_key: TSecretValue = TSecretValue(""), # noqa - package_profiles_dir: str = None, - package_profile_name: str = None, - auto_full_refresh_when_out_of_sync: bool = None, + package_repository_branch: Optional[str] = None, + package_repository_ssh_key: Optional[TSecretValue] = TSecretValue(""), # noqa + package_profiles_dir: Optional[str] = None, + package_profile_name: Optional[str] = None, + auto_full_refresh_when_out_of_sync: bool = True, config: DBTRunnerConfiguration = None, ) -> DBTPackageRunner: """Creates a Python wrapper over `dbt` package present at specified location, that allows to control it (ie. run and test) from Python code. diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index f8900ae562..20ba0b07d0 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -112,11 +112,11 @@ def pipeline( credentials: Any = None, progress: TCollectorArg = _NULL_COLLECTOR, _impl_cls: Type[TPipeline] = Pipeline, # type: ignore[assignment] - **kwargs: Any, + **injection_kwargs: Any, ) -> TPipeline: - ensure_correct_pipeline_kwargs(pipeline, **kwargs) + ensure_correct_pipeline_kwargs(pipeline, **injection_kwargs) # call without arguments returns current pipeline - orig_args = get_orig_args(**kwargs) # original (*args, **kwargs) + orig_args = get_orig_args(**injection_kwargs) # original (*args, **kwargs) # is any of the arguments different from defaults has_arguments = bool(orig_args[0]) or any(orig_args[1].values()) @@ -136,11 +136,12 @@ def pipeline( pipelines_dir = get_dlt_pipelines_dir() destination = Destination.from_reference( - destination or kwargs["destination_type"], destination_name=kwargs["destination_name"] + destination or injection_kwargs["destination_type"], + destination_name=injection_kwargs["destination_name"], ) staging = Destination.from_reference( - staging or kwargs.get("staging_type", None), - destination_name=kwargs.get("staging_name", None), + staging or injection_kwargs.get("staging_type", None), + destination_name=injection_kwargs.get("staging_name", None), ) progress = collector_from_name(progress) @@ -158,8 +159,8 @@ def pipeline( full_refresh if full_refresh is not None else dev_mode, progress, False, - last_config(**kwargs), - kwargs["runtime"], + last_config(**injection_kwargs), + injection_kwargs["runtime"], refresh=refresh, ) # set it as current pipeline @@ -176,10 +177,10 @@ def attach( dev_mode: bool = False, credentials: Any = None, progress: TCollectorArg = _NULL_COLLECTOR, - **kwargs: Any, + **injection_kwargs: Any, ) -> Pipeline: """Attaches to the working folder of `pipeline_name` in `pipelines_dir` or in default directory. 
Requires that valid pipeline state exists in working folder.""" - ensure_correct_pipeline_kwargs(attach, **kwargs) + ensure_correct_pipeline_kwargs(attach, **injection_kwargs) full_refresh_argument_deprecated("attach", full_refresh) # if working_dir not provided use temp folder if not pipelines_dir: @@ -199,8 +200,8 @@ def attach( full_refresh if full_refresh is not None else dev_mode, progress, True, - last_config(**kwargs), - kwargs["runtime"], + last_config(**injection_kwargs), + injection_kwargs["runtime"], ) # set it as current pipeline p.activate() diff --git a/dlt/pipeline/dbt.py b/dlt/pipeline/dbt.py index e647e475ed..ee900005fd 100644 --- a/dlt/pipeline/dbt.py +++ b/dlt/pipeline/dbt.py @@ -3,7 +3,7 @@ from dlt.common.exceptions import VenvNotFound from dlt.common.runners import Venv from dlt.common.schema import Schema -from dlt.common.typing import TSecretValue +from dlt.common.typing import ConfigValue, TSecretValue from dlt.common.schema.utils import normalize_schema_name from dlt.helpers.dbt import ( @@ -52,9 +52,9 @@ def get_venv( def package( pipeline: Pipeline, package_location: str, - package_repository_branch: str = None, + package_repository_branch: str = ConfigValue, package_repository_ssh_key: TSecretValue = TSecretValue(""), # noqa - auto_full_refresh_when_out_of_sync: bool = None, + auto_full_refresh_when_out_of_sync: bool = ConfigValue, venv: Venv = None, ) -> DBTPackageRunner: """Creates a Python wrapper over `dbt` package present at specified location, that allows to control it (ie. run and test) from Python code. diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 81b50a8326..b0fb6fe57c 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -50,7 +50,7 @@ ) from dlt.common.schema.utils import normalize_schema_name from dlt.common.storages.exceptions import LoadPackageNotFound -from dlt.common.typing import TFun, TSecretValue, is_optional_type +from dlt.common.typing import ConfigValue, TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner from dlt.common.storages import ( LiveSchemaStorage, @@ -409,8 +409,8 @@ def extract( columns: TAnySchemaColumns = None, primary_key: TColumnNames = None, schema: Schema = None, - max_parallel_items: int = None, - workers: int = None, + max_parallel_items: int = ConfigValue, + workers: int = ConfigValue, schema_contract: TSchemaContract = None, refresh: Optional[TRefreshMode] = None, ) -> ExtractInfo: diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md index 4a80de1bdf..de3e5f4c35 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md @@ -411,22 +411,22 @@ Here, we use the `mysql` and `pymysql` dialects to set up an SSL connection to a **To connect to an `mssql` server using Windows authentication**, include `trusted_connection=yes` in the connection string. 
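A quick way to sanity check such a connection string before committing it to `secrets.toml` is to hand it to SQLAlchemy directly; a minimal sketch, reusing the example URL from the snippet below (this verification step is an assumption, not part of the documented setup):

```python
from sqlalchemy import create_engine

# example URL taken from the docs snippet below;
# trusted_connection=yes switches the mssql+pyodbc dialect to Windows authentication
engine = create_engine(
    "mssql+pyodbc://loader.database.windows.net/dlt_data"
    "?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server"
)
with engine.connect() as conn:
    # resolves to "mssql" when the driver is installed and the URL parses correctly
    print(conn.dialect.name)
```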
```toml -destination.mssql.credentials="mssql+pyodbc://loader.database.windows.net/dlt_data?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server" +sources.sql_database.credentials="mssql+pyodbc://loader.database.windows.net/dlt_data?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server" ``` **To connect to a local sql server instance running without SSL** pass `encrypt=no` parameter: ```toml -destination.mssql.credentials="mssql+pyodbc://loader:loader@localhost/dlt_data?encrypt=no&driver=ODBC+Driver+17+for+SQL+Server" +sources.sql_database.credentials="mssql+pyodbc://loader:loader@localhost/dlt_data?encrypt=no&driver=ODBC+Driver+17+for+SQL+Server" ``` **To allow self signed SSL certificate** when you are getting `certificate verify failed:unable to get local issuer certificate`: ```toml -destination.mssql.credentials="mssql+pyodbc://loader:loader@localhost/dlt_data?TrustServerCertificate=yes&driver=ODBC+Driver+17+for+SQL+Server" +sources.sql_database.credentials="mssql+pyodbc://loader:loader@localhost/dlt_data?TrustServerCertificate=yes&driver=ODBC+Driver+17+for+SQL+Server" ``` ***To use long strings (>8k) and avoid collation errors**: ```toml -destination.mssql.credentials="mssql+pyodbc://loader:loader@localhost/dlt_data?LongAsMax=yes&driver=ODBC+Driver+17+for+SQL+Server" +sources.sql_database.credentials="mssql+pyodbc://loader:loader@localhost/dlt_data?LongAsMax=yes&driver=ODBC+Driver+17+for+SQL+Server" ``` ## Customizations diff --git a/tests/common/configuration/test_accessors.py b/tests/common/configuration/test_accessors.py index 147d56abec..dc8761110f 100644 --- a/tests/common/configuration/test_accessors.py +++ b/tests/common/configuration/test_accessors.py @@ -19,7 +19,7 @@ from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext from dlt.common.configuration.utils import get_resolved_traces, ResolvedValueTrace from dlt.common.runners.configuration import PoolRunnerConfiguration -from dlt.common.typing import AnyType, TSecretValue +from dlt.common.typing import AnyType, ConfigValue, SecretValue, TSecretValue from tests.utils import preserve_environ @@ -29,8 +29,8 @@ def test_accessor_singletons() -> None: - assert dlt.config.value is None - assert dlt.secrets.value is None + assert dlt.config.value is ConfigValue + assert dlt.secrets.value is SecretValue def test_getter_accessor(toml_providers: ConfigProvidersContext, environment: Any) -> None: diff --git a/tests/common/configuration/test_configuration.py b/tests/common/configuration/test_configuration.py index 945856e93f..dd687bee83 100644 --- a/tests/common/configuration/test_configuration.py +++ b/tests/common/configuration/test_configuration.py @@ -5,6 +5,7 @@ Any, Dict, Final, + Generic, List, Mapping, MutableMapping, @@ -13,16 +14,23 @@ Type, Union, ) -from typing_extensions import Annotated +from typing_extensions import Annotated, TypeVar from dlt.common import json, pendulum, Decimal, Wei from dlt.common.configuration.providers.provider import ConfigProvider -from dlt.common.configuration.specs.base_configuration import NotResolved, is_hint_not_resolved +from dlt.common.configuration.specs.base_configuration import NotResolved, is_hint_not_resolvable from dlt.common.configuration.specs.gcp_credentials import ( GcpServiceAccountCredentialsWithoutDefaults, ) from dlt.common.utils import custom_environ, get_exception_trace, get_exception_trace_chain -from dlt.common.typing import AnyType, DictStrAny, StrAny, TSecretValue, extract_inner_type +from dlt.common.typing import ( + 
AnyType, + ConfigValue, + DictStrAny, + StrAny, + TSecretValue, + extract_inner_type, +) from dlt.common.configuration.exceptions import ( ConfigFieldMissingTypeHintException, ConfigFieldTypeHintNotSupported, @@ -317,6 +325,32 @@ def test_explicit_values_false_when_bool() -> None: assert c.heels == "" +def test_explicit_embedded_config(environment: Any) -> None: + instr_explicit = InstrumentedConfiguration(head="h", tube=["tu", "be"], heels="xhe") + + environment["INSTRUMENTED__HEAD"] = "hed" + c = resolve.resolve_configuration( + EmbeddedConfiguration(default="X", sectioned=SectionedConfiguration(password="S")), + explicit_value={"instrumented": instr_explicit}, + ) + + # explicit value will be part of the resolved configuration + assert c.instrumented is instr_explicit + # configuration was injected from env + assert c.instrumented.head == "hed" + + # the same but with resolved + instr_explicit = InstrumentedConfiguration(head="h", tube=["tu", "be"], heels="xhe") + instr_explicit.resolve() + c = resolve.resolve_configuration( + EmbeddedConfiguration(default="X", sectioned=SectionedConfiguration(password="S")), + explicit_value={"instrumented": instr_explicit}, + ) + assert c.instrumented is instr_explicit + # but configuration is not injected + assert c.instrumented.head == "h" + + def test_default_values(environment: Any) -> None: # explicit values override the environment and all else environment["PIPELINE_NAME"] = "env name" @@ -931,12 +965,14 @@ def test_is_valid_hint() -> None: def test_is_not_resolved_hint() -> None: - assert is_hint_not_resolved(Final[ConfigFieldMissingException]) is True - assert is_hint_not_resolved(Annotated[ConfigFieldMissingException, NotResolved()]) is True - assert is_hint_not_resolved(Annotated[ConfigFieldMissingException, NotResolved(True)]) is True - assert is_hint_not_resolved(Annotated[ConfigFieldMissingException, NotResolved(False)]) is False - assert is_hint_not_resolved(Annotated[ConfigFieldMissingException, "REQ"]) is False - assert is_hint_not_resolved(str) is False + assert is_hint_not_resolvable(Final[ConfigFieldMissingException]) is True + assert is_hint_not_resolvable(Annotated[ConfigFieldMissingException, NotResolved()]) is True + assert is_hint_not_resolvable(Annotated[ConfigFieldMissingException, NotResolved(True)]) is True + assert ( + is_hint_not_resolvable(Annotated[ConfigFieldMissingException, NotResolved(False)]) is False + ) + assert is_hint_not_resolvable(Annotated[ConfigFieldMissingException, "REQ"]) is False + assert is_hint_not_resolvable(str) is False def test_not_resolved_hint() -> None: @@ -1318,6 +1354,75 @@ class EmbeddedConfigurationWithDefaults(BaseConfiguration): assert c_resolved.conn_str.is_resolved() +def test_configuration_with_generic(environment: Dict[str, str]) -> None: + TColumn = TypeVar("TColumn", bound=str) + + @configspec + class IncrementalConfiguration(BaseConfiguration, Generic[TColumn]): + # TODO: support generics field + column: str = ConfigValue + + @configspec + class SourceConfiguration(BaseConfiguration): + name: str = ConfigValue + incremental: IncrementalConfiguration[str] = ConfigValue + + # resolve incremental + environment["COLUMN"] = "column" + c = resolve.resolve_configuration(IncrementalConfiguration[str]()) + assert c.column == "column" + + # resolve embedded config with generic + environment["INCREMENTAL__COLUMN"] = "column_i" + c2 = resolve.resolve_configuration(SourceConfiguration(name="name")) + assert c2.incremental.column == "column_i" + + # put incremental in union + @configspec + class 
SourceUnionConfiguration(BaseConfiguration): + name: str = ConfigValue + incremental_union: Optional[IncrementalConfiguration[str]] = ConfigValue + + c3 = resolve.resolve_configuration(SourceUnionConfiguration(name="name")) + assert c3.incremental_union is None + environment["INCREMENTAL_UNION__COLUMN"] = "column_u" + c3 = resolve.resolve_configuration(SourceUnionConfiguration(name="name")) + assert c3.incremental_union.column == "column_u" + + class Sentinel: + pass + + class SubSentinel(Sentinel): + pass + + @configspec + class SourceWideUnionConfiguration(BaseConfiguration): + name: str = ConfigValue + incremental_w_union: Union[IncrementalConfiguration[str], str, Sentinel] = ConfigValue + incremental_sub: Optional[Union[IncrementalConfiguration[str], str, SubSentinel]] = None + + with pytest.raises(ConfigFieldMissingException): + resolve.resolve_configuration(SourceWideUnionConfiguration(name="name")) + + # use explicit sentinel + sentinel = Sentinel() + c4 = resolve.resolve_configuration( + SourceWideUnionConfiguration(name="name"), explicit_value={"incremental_w_union": sentinel} + ) + assert c4.incremental_w_union is sentinel + + # instantiate incremental + environment["INCREMENTAL_W_UNION__COLUMN"] = "column_w_u" + c4 = resolve.resolve_configuration(SourceWideUnionConfiguration(name="name")) + assert c4.incremental_w_union.column == "column_w_u" # type: ignore[union-attr] + + # sentinel (of super class type) also works for hint of subclass type + c4 = resolve.resolve_configuration( + SourceWideUnionConfiguration(name="name"), explicit_value={"incremental_sub": sentinel} + ) + assert c4.incremental_sub is sentinel + + def test_configuration_with_literal_field(environment: Dict[str, str]) -> None: """Literal type fields only allow values from the literal""" environment["REFRESH"] = "not_a_refresh_mode" diff --git a/tests/common/configuration/test_inject.py b/tests/common/configuration/test_inject.py index 1aa52c1919..f0494e9898 100644 --- a/tests/common/configuration/test_inject.py +++ b/tests/common/configuration/test_inject.py @@ -6,7 +6,10 @@ from dlt.common.configuration.exceptions import ConfigFieldMissingException from dlt.common.configuration.inject import ( + _LAST_DLT_CONFIG, + _ORIGINAL_ARGS, get_fun_spec, + get_orig_args, last_config, with_config, create_resolved_partial, @@ -20,11 +23,16 @@ GcpServiceAccountCredentialsWithoutDefaults, ConnectionStringCredentials, ) -from dlt.common.configuration.specs.base_configuration import configspec, is_secret_hint +from dlt.common.configuration.specs.base_configuration import ( + CredentialsConfiguration, + configspec, + is_secret_hint, + is_valid_configspec_field, +) from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.reflection.spec import _get_spec_name_from_f -from dlt.common.typing import StrAny, TSecretValue, is_newtype_type +from dlt.common.typing import StrAny, TSecretStrValue, TSecretValue, is_newtype_type from tests.utils import preserve_environ from tests.common.configuration.utils import environment, toml_providers @@ -47,8 +55,28 @@ def f_var_env(user=dlt.config.value, path=dlt.config.value): assert path == "explicit path" # user will be injected - f_var_env(None, path="explicit path") - f_var_env(path="explicit path", user=None) + f_var_env(dlt.config.value, path="explicit path") + f_var_env(path="explicit path", user=dlt.secrets.value) + + # none will be passed and trigger 
config missing + with pytest.raises(ConfigFieldMissingException) as cfg_ex: + f_var_env(None, path="explicit path") + assert "user" in cfg_ex.value.traces + assert cfg_ex.value.traces["user"][0].provider == "ExplicitValues" + + +def test_explicit_none(environment: Any) -> None: + @with_config + def f_var(user: Optional[str] = "default"): + return user + + assert f_var(None) is None + assert f_var() == "default" + assert f_var(dlt.config.value) == "default" + environment["USER"] = "env user" + assert f_var() == "env user" + assert f_var(None) is None + assert f_var(dlt.config.value) == "env user" def test_default_values_are_resolved(environment: Any) -> None: @@ -83,11 +111,64 @@ def f_secret(password=dlt.secrets.value): environment["USER"] = "user" assert f_config() == "user" - assert f_config(None) == "user" + assert f_config(dlt.config.value) == "user" environment["PASSWORD"] = "password" assert f_secret() == "password" - assert f_secret(None) == "password" + assert f_secret(dlt.secrets.value) == "password" + + +def test_dlt_literals_in_spec() -> None: + @configspec + class LiteralsConfiguration(BaseConfiguration): + required_str: str = dlt.config.value + required_int: int = dlt.config.value + required_secret: TSecretStrValue = dlt.secrets.value + credentials: CredentialsConfiguration = dlt.secrets.value + optional_default: float = 1.2 + + fields = { + k: f.default + for k, f in LiteralsConfiguration.__dataclass_fields__.items() + if is_valid_configspec_field(f) + } + # make sure all special values are evaluated to None which indicate required params + assert fields == { + "required_str": None, + "required_int": None, + "required_secret": None, + "credentials": None, + "optional_default": 1.2, + } + c = LiteralsConfiguration() + assert dict(c) == fields + + # instantiate to make sure linter does not complain + c = LiteralsConfiguration("R", 0, TSecretStrValue("A"), ConnectionStringCredentials()) + assert dict(c) == { + "required_str": "R", + "required_int": 0, + "required_secret": TSecretStrValue("A"), + "credentials": ConnectionStringCredentials(), + "optional_default": 1.2, + } + + # this generates warnings + @configspec + class WrongLiteralsConfiguration(BaseConfiguration): + required_int: int = dlt.secrets.value + required_secret: TSecretStrValue = dlt.config.value + credentials: CredentialsConfiguration = dlt.config.value + + +def test_dlt_literals_defaults_none() -> None: + @with_config + def with_optional_none( + level: Optional[int] = dlt.config.value, aux: Optional[str] = dlt.secrets.value + ): + return (level, aux) + + assert with_optional_none() == (None, None) def test_inject_from_argument_section(toml_providers: ConfigProvidersContext) -> None: @@ -104,12 +185,14 @@ def f_credentials(gcp_storage: GcpServiceAccountCredentialsWithoutDefaults = dlt def test_inject_secret_value_secret_type(environment: Any) -> None: @with_config def f_custom_secret_type( - _dict: Dict[str, Any] = dlt.secrets.value, _int: int = dlt.secrets.value, **kwargs: Any + _dict: Dict[str, Any] = dlt.secrets.value, + _int: int = dlt.secrets.value, + **injection_kwargs: Any, ): # secret values were coerced into types assert _dict == {"a": 1} assert _int == 1234 - cfg = last_config(**kwargs) + cfg = last_config(**injection_kwargs) spec: Type[BaseConfiguration] = cfg.__class__ # assert that types are secret for f in ["_dict", "_int"]: @@ -124,6 +207,22 @@ def f_custom_secret_type( f_custom_secret_type() +def test_aux_not_injected_into_kwargs() -> None: + # only kwargs with name injection_kwargs receive aux 
info + + @configspec + class AuxTest(BaseConfiguration): + aux: str = "INFO" + + @with_config(spec=AuxTest) + def f_no_aux(**kwargs: Any): + assert "aux" not in kwargs + assert _LAST_DLT_CONFIG not in kwargs + assert _ORIGINAL_ARGS not in kwargs + + f_no_aux() + + @pytest.mark.skip("not implemented") def test_inject_with_non_injectable_param() -> None: # one of parameters in signature has not valid hint and is skipped (ie. from_pipe) @@ -356,15 +455,50 @@ def test_inject_with_str_sections() -> None: pass -@pytest.mark.skip("not implemented") -def test_inject_with_func_section() -> None: +def test_inject_with_func_section(environment: Any) -> None: # function to get sections from the arguments is provided - pass + @with_config(sections=lambda args: "dlt_" + args["name"]) # type: ignore[call-overload] + def table_info(name, password=dlt.secrets.value): + return password + + environment["DLT_USERS__PASSWORD"] = "pass" + assert table_info("users") == "pass" + + @with_config(sections=lambda args: ("dlt", args["name"])) # type: ignore[call-overload] + def table_info_2(name, password=dlt.secrets.value): + return password + + environment["DLT__CONTACTS__PASSWORD"] = "pass_x" + assert table_info_2("contacts") == "pass_x" -@pytest.mark.skip("not implemented") -def test_inject_on_class_and_methods() -> None: - pass + +def test_inject_on_class_and_methods(environment: Any) -> None: + environment["AUX"] = "DEBUG" + environment["LEVEL"] = "1" + + class AuxCallReceiver: + @with_config + def __call__(self, level: int = dlt.config.value, aux: str = dlt.config.value) -> Any: + return (level, aux) + + assert AuxCallReceiver()() == (1, "DEBUG") + + class AuxReceiver: + @with_config + def __init__(self, level: int = dlt.config.value, aux: str = dlt.config.value) -> None: + self.level = level + self.aux = aux + + @with_config + def resolve(self, level: int = dlt.config.value, aux: str = dlt.config.value) -> Any: + return (level, aux) + + kl_ = AuxReceiver() + assert kl_.level == 1 + assert kl_.aux == "DEBUG" + + assert kl_.resolve() == (1, "DEBUG") @pytest.mark.skip("not implemented") @@ -374,34 +508,96 @@ def test_set_defaults_for_positional_args() -> None: pass -@pytest.mark.skip("not implemented") def test_inject_spec_remainder_in_kwargs() -> None: # if the wrapped func contains kwargs then all the fields from spec without matching func args must be injected in kwargs - pass + @configspec + class AuxTest(BaseConfiguration): + level: int = None + aux: str = "INFO" + + @with_config(spec=AuxTest) + def f_aux(level, **injection_kwargs: Any): + # level is in args so not added to kwargs + assert level == 1 + assert "level" not in injection_kwargs + # remainder in kwargs + assert injection_kwargs["aux"] == "INFO" + # assert _LAST_DLT_CONFIG not in kwargs + # assert _ORIGINAL_ARGS not in kwargs + + f_aux(1) -@pytest.mark.skip("not implemented") def test_inject_spec_in_kwargs() -> None: - # the resolved spec is injected in kwargs - pass + @configspec + class AuxTest(BaseConfiguration): + aux: str = "INFO" + @with_config(spec=AuxTest) + def f_kw_spec(**injection_kwargs: Any): + c = last_config(**injection_kwargs) + assert c.aux == "INFO" + # no args, no kwargs + assert get_orig_args(**injection_kwargs) == ((), {}) -@pytest.mark.skip("not implemented") -def test_resolved_spec_in_kwargs_pass_through() -> None: + f_kw_spec() + + +def test_resolved_spec_in_kwargs_pass_through(environment: Any) -> None: # if last_config is in kwargs then use it and do not resolve it anew - pass + @configspec + class 
AuxTest(BaseConfiguration): + aux: str = "INFO" + + @with_config(spec=AuxTest) + def init_cf(aux: str = dlt.config.value, **injection_kwargs: Any): + assert aux == "DEBUG" + return last_config(**injection_kwargs) + + environment["AUX"] = "DEBUG" + c = init_cf() + + @with_config(spec=AuxTest) + def get_cf(aux: str = dlt.config.value, last_config: AuxTest = None): + assert aux == "DEBUG" + assert last_config.aux == "DEBUG" + return last_config + + # this will be ignored, last_config is regarded as resolved + environment["AUX"] = "ERROR" + assert get_cf(last_config=c) is c -@pytest.mark.skip("not implemented") def test_inject_spec_into_argument_with_spec_type() -> None: # if signature contains argument with type of SPEC, it gets injected there - pass + from dlt.destinations.impl.dummy import _configure, DummyClientConfiguration + # _configure has argument of type DummyClientConfiguration that it returns + # this type holds resolved configuration + c = _configure() + assert isinstance(c, DummyClientConfiguration) -@pytest.mark.skip("not implemented") -def test_initial_spec_from_arg_with_spec_type() -> None: + +def test_initial_spec_from_arg_with_spec_type(environment: Any) -> None: # if signature contains argument with type of SPEC, get its value to init SPEC (instead of calling the constructor()) - pass + @configspec + class AuxTest(BaseConfiguration): + level: int = None + aux: str = "INFO" + + @with_config(spec=AuxTest) + def init_cf( + level: int = dlt.config.value, aux: str = dlt.config.value, init_cf: AuxTest = None + ): + assert level == -1 + assert aux == "DEBUG" + # init_cf was used as init but also got resolved + assert init_cf.aux == "DEBUG" + return init_cf + + init_c = AuxTest(level=-1) + environment["AUX"] = "DEBUG" + assert init_cf(init_cf=init_c) is init_c def test_use_most_specific_union_type( diff --git a/tests/common/configuration/test_toml_provider.py b/tests/common/configuration/test_toml_provider.py index 4f2219716a..43bad21ece 100644 --- a/tests/common/configuration/test_toml_provider.py +++ b/tests/common/configuration/test_toml_provider.py @@ -89,7 +89,7 @@ def single_val(port=None): return port # secrets have api.port=1023 and this will be used - assert single_val(None) == 1023 + assert single_val(dlt.secrets.value) == 1023 # env will make it string, also section is optional environment["PORT"] = "UNKNOWN" @@ -110,7 +110,7 @@ def mixed_val( ): return api_type, secret_value, typecheck - _tup = mixed_val(None, None, None) + _tup = mixed_val(dlt.config.value, dlt.secrets.value, dlt.config.value) assert _tup[0] == "REST" assert _tup[1] == "2137" assert isinstance(_tup[2], dict) diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py index 6cb76fba9d..e97fac8a9e 100644 --- a/tests/common/storages/test_schema_storage.py +++ b/tests/common/storages/test_schema_storage.py @@ -62,14 +62,14 @@ def ie_storage(request) -> SchemaStorage: ) -def init_storage(cls, C: SchemaStorageConfiguration) -> SchemaStorage: +def init_storage(cls, config: SchemaStorageConfiguration) -> SchemaStorage: # use live schema storage for test which must be backward compatible with schema storage - s = cls(C, makedirs=True) - assert C is s.config - if C.export_schema_path: - os.makedirs(C.export_schema_path, exist_ok=True) - if C.import_schema_path: - os.makedirs(C.import_schema_path, exist_ok=True) + s = cls(config, makedirs=True) + assert config is s.config + if config.export_schema_path: + os.makedirs(config.export_schema_path, exist_ok=True) + if 
config.import_schema_path: + os.makedirs(config.import_schema_path, exist_ok=True) return s diff --git a/tests/common/test_typing.py b/tests/common/test_typing.py index cc319619e6..3a9e320040 100644 --- a/tests/common/test_typing.py +++ b/tests/common/test_typing.py @@ -29,11 +29,13 @@ StrAny, extract_inner_type, extract_union_types, + get_all_types_of_class_in_union, is_dict_generic_type, is_list_generic_type, is_literal_type, is_newtype_type, is_optional_type, + is_subclass, is_typeddict, is_union_type, is_annotated, @@ -138,6 +140,9 @@ def test_is_literal() -> None: assert is_literal_type(Final[TTestLi]) is True # type: ignore[arg-type] assert is_literal_type("a") is False # type: ignore[arg-type] assert is_literal_type(List[str]) is False + NT1 = NewType("NT1", Optional[TTestLi]) # type: ignore[valid-newtype] + assert is_literal_type(NT1) is True + assert is_literal_type(NewType("NT2", NT1)) is True def test_optional() -> None: @@ -151,6 +156,11 @@ def test_optional() -> None: assert is_optional_type(Final[Annotated[Union[str, int], None]]) is False # type: ignore[arg-type] assert is_optional_type(Annotated[Union[str, int], type(None)]) is False # type: ignore[arg-type] assert is_optional_type(TOptionalTyDi) is True # type: ignore[arg-type] + NT1 = NewType("NT1", Optional[str]) # type: ignore[valid-newtype] + assert is_optional_type(NT1) is True + assert is_optional_type(ClassVar[NT1]) is True # type: ignore[arg-type] + assert is_optional_type(NewType("NT2", NT1)) is True + assert is_optional_type(NewType("NT2", Annotated[NT1, 1])) is True assert is_optional_type(TTestTyDi) is False assert extract_union_types(TOptionalLi) == [TTestLi, type(None)] # type: ignore[arg-type] assert extract_union_types(TOptionalTyDi) == [TTestTyDi, type(None)] # type: ignore[arg-type] @@ -173,6 +183,7 @@ def test_is_newtype() -> None: assert is_newtype_type(ClassVar[NT1]) is True # type: ignore[arg-type] assert is_newtype_type(TypeVar("TV1", bound=str)) is False # type: ignore[arg-type] assert is_newtype_type(1) is False # type: ignore[arg-type] + assert is_newtype_type(Optional[NT1]) is True # type: ignore[arg-type] def test_is_annotated() -> None: @@ -195,6 +206,7 @@ def test_extract_inner_type() -> None: assert extract_inner_type(NTL2, preserve_new_types=True) is NTL2 l_2 = Literal[NTL2(1.238), NTL2(2.343)] # type: ignore[valid-type] assert extract_inner_type(l_2) is float # type: ignore[arg-type] + assert extract_inner_type(NewType("NT1", Optional[str])) is str def test_get_config_if_union() -> None: @@ -223,3 +235,38 @@ def test_extract_annotated_inner_type() -> None: assert extract_inner_type(Annotated[Optional[MyDataclass], "meta"]) is MyDataclass # type: ignore[arg-type] assert extract_inner_type(Annotated[MyDataclass, Optional]) is MyDataclass # type: ignore[arg-type] assert extract_inner_type(Annotated[MyDataclass, "random metadata string"]) is MyDataclass # type: ignore[arg-type] + + +def test_is_subclass() -> None: + from dlt.extract import Incremental + + assert is_subclass(Incremental, BaseConfiguration) is True + assert is_subclass(Incremental[float], Incremental[int]) is True + assert is_subclass(BaseConfiguration, Incremental[int]) is False + assert is_subclass(list, Sequence) is True + assert is_subclass(list, Sequence[str]) is True + # unions, new types, literals etc. 
will always produce False + assert is_subclass(list, Optional[list]) is False + assert is_subclass(Optional[list], list) is False + assert is_subclass(list, TTestLi) is False + assert is_subclass(TTestLi, TTestLi) is False + assert is_subclass(list, NewType("LT", list)) is False + + +def test_get_all_types_of_class_in_union() -> None: + from dlt.extract import Incremental + + # optional is an union + assert get_all_types_of_class_in_union(Optional[str], str) == [str] + # both classes and type aliases are recognized + assert get_all_types_of_class_in_union(Optional[Incremental], BaseConfiguration) == [ + Incremental + ] + assert get_all_types_of_class_in_union(Optional[Incremental[float]], BaseConfiguration) == [ + Incremental[float] + ] + # by default superclasses are not recognized + assert get_all_types_of_class_in_union(Union[BaseConfiguration, str], Incremental[float]) == [] + assert get_all_types_of_class_in_union( + Union[BaseConfiguration, str], Incremental[float], with_superclass=True + ) == [BaseConfiguration] diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index 229ce17085..e08c1cdf01 100644 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -1,12 +1,14 @@ +from copy import deepcopy import itertools import inspect import binascii import pytest -from typing import Dict +from typing import Any, Dict from dlt.common.exceptions import PipelineException, TerminalValueError from dlt.common.runners import Venv from dlt.common.utils import ( + clone_dict_nested, graph_find_scc_nodes, flatten_list_of_str_or_dicts, digest128, @@ -293,6 +295,75 @@ def test_nested_dict_merge() -> None: assert update_dict_nested(dict(dict_1), dict_2) == {"a": 2, "b": 2, "c": 4} assert update_dict_nested(dict(dict_2), dict_1) == {"a": 1, "b": 2, "c": 4} - assert update_dict_nested(dict(dict_1), dict_2, keep_dst_values=True) == update_dict_nested( - dict_2, dict_1 + assert update_dict_nested(dict(dict_1), dict_2, copy_src_dicts=True) == {"a": 2, "b": 2, "c": 4} + assert update_dict_nested(dict(dict_2), dict_1, copy_src_dicts=True) == {"a": 1, "b": 2, "c": 4} + dict_1_update = update_dict_nested({}, dict_1) + assert dict_1_update == dict_1 + assert dict_1_update is not dict_1 + dict_1_update = clone_dict_nested(dict_1) + assert dict_1_update == dict_1 + assert dict_1_update is not dict_1 + + dict_1_deep = {"a": 3, "b": dict_1} + dict_1_deep_clone = update_dict_nested({}, dict_1_deep) + assert dict_1_deep_clone == dict_1_deep + # reference got copied + assert dict_1_deep_clone["b"] is dict_1 + # update with copy + dict_1_deep_clone = clone_dict_nested(dict_1_deep) + assert dict_1_deep_clone == dict_1_deep + # reference got copied + assert dict_1_deep_clone["b"] is not dict_1 + + # make sure that that Mappings that are not dicts are atomically copied + from dlt.common.configuration.specs import ConnectionStringCredentials + + dsn = ConnectionStringCredentials("postgres://loader:loader@localhost:5432/dlt_data") + dict_1_mappings: Dict[str, Any] = { + "_tuple": (1, 2), + "_config": {"key": "str", "_dsn": dsn, "_dict": dict_1_deep}, + } + # make a clone + dict_1_mappings_clone = clone_dict_nested(dict_1_mappings) + # values are same + assert dict_1_mappings == dict_1_mappings_clone + # all objects and mappings are copied as reference + assert dict_1_mappings["_tuple"] is dict_1_mappings_clone["_tuple"] + assert dict_1_mappings["_config"]["_dsn"] is dict_1_mappings_clone["_config"]["_dsn"] + # dicts are copied by value + assert dict_1_mappings["_config"] is not 
dict_1_mappings_clone["_config"] + assert dict_1_mappings["_config"]["_dict"] is not dict_1_mappings_clone["_config"]["_dict"] + assert ( + dict_1_mappings["_config"]["_dict"]["b"] + is not dict_1_mappings_clone["_config"]["_dict"]["b"] + ) + + # make a copy using references + dict_1_mappings_clone = update_dict_nested({}, dict_1_mappings) + assert dict_1_mappings["_config"] is dict_1_mappings_clone["_config"] + assert dict_1_mappings["_config"]["_dict"] is dict_1_mappings_clone["_config"]["_dict"] + assert ( + dict_1_mappings["_config"]["_dict"]["b"] is dict_1_mappings_clone["_config"]["_dict"]["b"] + ) + + # replace a few keys + print(dict_1_mappings) + # this should be non destructive for the dst + deep_clone_dict_1_mappings = deepcopy(dict_1_mappings) + mappings_update = update_dict_nested( + dict_1_mappings, {"_config": {"_dsn": ConnectionStringCredentials(), "_dict": {"a": "X"}}} + ) + # assert deep_clone_dict_1_mappings == dict_1_mappings + # things overwritten + assert dict_1_mappings["_config"]["_dsn"] is mappings_update["_config"]["_dsn"] + # this one is empty + assert mappings_update["_config"]["_dsn"].username is None + assert dict_1_mappings["_config"]["_dsn"].username is None + assert mappings_update["_config"]["_dict"]["a"] == "X" + assert dict_1_mappings["_config"]["_dict"]["a"] == "X" + + # restore original values + mappings_update = update_dict_nested( + mappings_update, {"_config": {"_dsn": dsn, "_dict": {"a": 3}}} ) + assert mappings_update == deep_clone_dict_1_mappings diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index c6a675a8d3..1cf14abe55 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -57,8 +57,10 @@ def resource(): "resource": "resource", "write_disposition": "append", } - assert resource().name == "resource" assert resource._args_bound is False + assert resource.name == "resource" + assert resource().args_bound is True + assert resource().name == "resource" assert resource.incremental is None assert resource.write_disposition == "append" @@ -630,6 +632,18 @@ def inner_resource(initial_id=dlt.config.value): assert "secret" in fields assert "config" in fields + @dlt.resource(standalone=True) + def inner_standalone_resource( + secret=dlt.secrets.value, config=dlt.config.value, opt: str = "A" + ): + yield 1 + + SPEC = get_fun_spec(inner_standalone_resource("TS", "CFG")._pipe.gen) # type: ignore[arg-type] + fields = SPEC.get_resolvable_fields() + # resources marked as standalone always inject full signature + assert len(fields) == 3 + assert {"secret", "config", "opt"} == set(fields.keys()) + @dlt.source def inner_source(secret=dlt.secrets.value, config=dlt.config.value, opt: str = "A"): return standalone_resource @@ -734,6 +748,11 @@ def standalone_signature(init: int, secret_end: int = dlt.secrets.value): yield from range(init, secret_end) +@dlt.resource +def regular_signature(init: int, secret_end: int = dlt.secrets.value): + yield from range(init, secret_end) + + def test_standalone_resource() -> None: # wrapped flag will not create the resource but just simple function wrapper that must be called before use @dlt.resource(standalone=True) @@ -746,7 +765,7 @@ def nice_signature(init: int): assert nice_signature.__doc__ == """Has nice signature""" assert list(nice_signature(7)) == [7, 8, 9] - assert nice_signature(8)._args_bound is True + assert nice_signature(8).args_bound is True with pytest.raises(TypeError): # bound! 
nice_signature(7)() @@ -800,7 +819,7 @@ def test_standalone_transformer() -> None: bound_tx = standalone_transformer(5, 10) # this is not really true - assert bound_tx._args_bound is True + assert bound_tx.args_bound is True with pytest.raises(TypeError): bound_tx(1) assert isinstance(bound_tx, DltResource) @@ -891,16 +910,28 @@ def rv_resource(uniq_name: str = dlt.config.value): assert conf_ex.value.fields == ["uniq_name"] -def test_resource_rename_credentials_separation(): +def test_standalone_resource_rename_credentials_separation(): os.environ["SOURCES__TEST_DECORATORS__STANDALONE_SIGNATURE__SECRET_END"] = "5" assert list(standalone_signature(1)) == [1, 2, 3, 4] - # config section is not impacted by the rename - # NOTE: probably we should keep it like that - os.environ["SOURCES__TEST_DECORATORS__RENAMED_SIG__SECRET_END"] = "6" + # os.environ["SOURCES__TEST_DECORATORS__RENAMED_SIG__SECRET_END"] = "6" + # assert list(standalone_signature.with_name("renamed_sig")(1)) == [1, 2, 3, 4, 5] + + # bound resource will not allow for reconfig assert list(standalone_signature(1).with_name("renamed_sig")) == [1, 2, 3, 4] +def test_resource_rename_credentials_separation(): + os.environ["SOURCES__TEST_DECORATORS__REGULAR_SIGNATURE__SECRET_END"] = "5" + assert list(regular_signature(1)) == [1, 2, 3, 4] + + os.environ["SOURCES__TEST_DECORATORS__RENAMED_SIG__SECRET_END"] = "6" + assert list(regular_signature.with_name("renamed_sig")(1)) == [1, 2, 3, 4, 5] + + # bound resource will not allow for reconfig + assert list(regular_signature(1).with_name("renamed_sig")) == [1, 2, 3, 4] + + def test_class_source() -> None: class _Source: def __init__(self, elems: int) -> None: diff --git a/tests/extract/test_extract.py b/tests/extract/test_extract.py index 9620e7fdfb..dc978b997a 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -317,7 +317,7 @@ def tx_step(item): def expect_tables(extract_step: Extract, resource: DltResource) -> dlt.Schema: source = DltSource(dlt.Schema("selectables"), "module", [resource(10)]) load_id = extract_step.extract_storage.create_load_package(source.discover_schema()) - extract_step._extract_single_source(load_id, source) + extract_step._extract_single_source(load_id, source, max_parallel_items=5, workers=1) # odd and even tables must be in the source schema assert len(source.schema.data_tables(include_incomplete=True)) == 2 assert "odd_table" in source.schema._schema_tables @@ -340,7 +340,7 @@ def expect_tables(extract_step: Extract, resource: DltResource) -> dlt.Schema: source = source.with_resources(resource.name) source.selected_resources[resource.name].bind(10).select_tables("odd_table") load_id = extract_step.extract_storage.create_load_package(source.discover_schema()) - extract_step._extract_single_source(load_id, source) + extract_step._extract_single_source(load_id, source, max_parallel_items=5, workers=1) assert len(source.schema.data_tables(include_incomplete=True)) == 1 assert "odd_table" in source.schema._schema_tables extract_step.extract_storage.commit_new_load_package(load_id, source.schema) diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 7fb9c39194..675f44bb14 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -1,5 +1,6 @@ import os import asyncio +import inspect import random from time import sleep from typing import Optional, Any @@ -12,6 +13,7 @@ import dlt from dlt.common.configuration.container import Container +from dlt.common.configuration.exceptions 
import InvalidNativeValue from dlt.common.configuration.specs.base_configuration import configspec, BaseConfiguration from dlt.common.configuration import ConfigurationValueError from dlt.common.pendulum import pendulum, timedelta @@ -24,6 +26,7 @@ from dlt.extract.exceptions import InvalidStepFunctionArguments from dlt.extract.resource import DltResource from dlt.sources.helpers.transform import take_first +from dlt.extract.incremental import IncrementalResourceWrapper, Incremental from dlt.extract.incremental.exceptions import ( IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing, @@ -39,6 +42,45 @@ ) +def test_detect_incremental_arg() -> None: + def incr_1(incremental: dlt.sources.incremental): # type: ignore[type-arg] + pass + + assert ( + IncrementalResourceWrapper.get_incremental_arg(inspect.signature(incr_1)).name + == "incremental" + ) + + def incr_2(incremental: Incremental[str]): + pass + + assert ( + IncrementalResourceWrapper.get_incremental_arg(inspect.signature(incr_2)).name + == "incremental" + ) + + def incr_3(incremental=dlt.sources.incremental[str]("updated_at")): # noqa + pass + + assert ( + IncrementalResourceWrapper.get_incremental_arg(inspect.signature(incr_3)).name + == "incremental" + ) + + def incr_4(incremental=Incremental[str]("updated_at")): # noqa + pass + + assert ( + IncrementalResourceWrapper.get_incremental_arg(inspect.signature(incr_4)).name + == "incremental" + ) + + def incr_5(incremental: IncrementalResourceWrapper): + pass + + assert IncrementalResourceWrapper.get_incremental_arg(inspect.signature(incr_5)) is None + + @pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) def test_single_items_last_value_state_is_updated(item_type: TestDataItemFormat) -> None: data = [ @@ -332,7 +374,7 @@ def test_optional_arg_from_spec_not_passed(item_type: TestDataItemFormat) -> Non @configspec class SomeDataOverrideConfiguration(BaseConfiguration): - created_at: dlt.sources.incremental = dlt.sources.incremental("created_at", initial_value="2022-02-03T00:00:00Z") # type: ignore[type-arg] + created_at: dlt.sources.incremental = dlt.sources.incremental("updated_at", initial_value="2022-02-03T00:00:00Z") # type: ignore[type-arg] # provide what to inject via spec. 
the spec contain the default @@ -351,6 +393,7 @@ def some_data_override_config( def test_override_initial_value_from_config(item_type: TestDataItemFormat) -> None: # use the shortest possible config version # os.environ['SOURCES__TEST_INCREMENTAL__SOME_DATA_OVERRIDE_CONFIG__CREATED_AT__INITIAL_VALUE'] = '2000-02-03T00:00:00Z' + os.environ["CREATED_AT__CURSOR_PATH"] = "created_at" os.environ["CREATED_AT__INITIAL_VALUE"] = "2000-02-03T00:00:00Z" p = dlt.pipeline(pipeline_name=uniq_id()) @@ -598,7 +641,7 @@ def some_data(last_timestamp=dlt.sources.incremental("item.timestamp|modifiedAt" def test_remove_incremental_with_explicit_none() -> None: @dlt.resource(standalone=True) def some_data( - last_timestamp: dlt.sources.incremental[float] = dlt.sources.incremental( + last_timestamp: Optional[dlt.sources.incremental[float]] = dlt.sources.incremental( "id", initial_value=9 ), ): @@ -623,9 +666,9 @@ def some_data_optional( assert last_timestamp is None yield 1 - # we disable incremental by typing the argument as optional - # if not disabled it would fail on "item.timestamp" not found - assert list(some_data_optional(last_timestamp=dlt.sources.incremental.EMPTY)) == [1] + # can't use EMPTY to reset incremental + with pytest.raises(ValueError): + list(some_data_optional(last_timestamp=dlt.sources.incremental.EMPTY)) @dlt.resource(standalone=True) def some_data( @@ -635,8 +678,8 @@ def some_data( yield 1 # we'll get the value error - with pytest.raises(ValueError): - assert list(some_data(last_timestamp=dlt.sources.incremental.EMPTY)) == [1] + with pytest.raises(InvalidNativeValue): + list(some_data(last_timestamp=dlt.sources.incremental.EMPTY)) @pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) @@ -860,20 +903,23 @@ def some_data(last_timestamp=dlt.sources.incremental("ts", primary_key=())): @pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) def test_apply_hints_incremental(item_type: TestDataItemFormat) -> None: - p = dlt.pipeline(pipeline_name=uniq_id()) + p = dlt.pipeline(pipeline_name=uniq_id(), destination="dummy") data = [{"created_at": 1}, {"created_at": 2}, {"created_at": 3}] source_items = data_to_item_format(item_type, data) @dlt.resource def some_data(created_at: Optional[dlt.sources.incremental[int]] = None): # make sure that incremental from apply_hints is here - assert created_at is not None - assert created_at.last_value_func is max + if created_at is not None: + assert created_at.cursor_path == "created_at" + assert created_at.last_value_func is max yield source_items # the incremental wrapper is created for a resource and the incremental value is provided via apply hints r = some_data() assert r is not some_data + assert r.incremental is not None + assert r.incremental.incremental is None r.apply_hints(incremental=dlt.sources.incremental("created_at", last_value_func=max)) if item_type == "pandas": assert list(r)[0].equals(source_items[0]) @@ -881,6 +927,9 @@ def some_data(created_at: Optional[dlt.sources.incremental[int]] = None): assert list(r) == source_items p.extract(r) assert "incremental" in r.state + assert r.incremental.incremental is not None + assert len(r._pipe) == 2 + # no more elements assert list(r) == [] # same thing with explicit None @@ -894,11 +943,20 @@ def some_data(created_at: Optional[dlt.sources.incremental[int]] = None): assert "incremental" in r.state assert list(r) == [] + # remove incremental + r.apply_hints(incremental=dlt.sources.incremental.EMPTY) + assert r.incremental is not None + assert r.incremental.incremental is 
None + if item_type == "pandas": + assert list(r)[0].equals(source_items[0]) + else: + assert list(r) == source_items + # as above but we provide explicit incremental when creating resource p = p.drop() - r = some_data(created_at=dlt.sources.incremental("created_at", last_value_func=max)) - # explicit has precedence here and hints will be ignored - r.apply_hints(incremental=dlt.sources.incremental("created_at", last_value_func=min)) + r = some_data(created_at=dlt.sources.incremental("created_at", last_value_func=min)) + # hints have precedence, as expected + r.apply_hints(incremental=dlt.sources.incremental("created_at", last_value_func=max)) p.extract(r) assert "incremental" in r.state # max value @@ -927,10 +985,128 @@ def some_data_no_incremental(): # we add incremental as a step p = p.drop() r = some_data_no_incremental() - r.apply_hints(incremental=dlt.sources.incremental("created_at", last_value_func=max)) - assert r.incremental is not None + print(r._pipe) + incr_instance = dlt.sources.incremental("created_at", last_value_func=max) + r.apply_hints(incremental=incr_instance) + print(r._pipe) + assert r.incremental is incr_instance p.extract(r) assert "incremental" in r.state + info = p.normalize() + assert info.row_counts["some_data_no_incremental"] == 3 + # make sure we can override incremental + incr_instance = dlt.sources.incremental("created_at", last_value_func=max, row_order="desc") + r.apply_hints(incremental=incr_instance) + assert r.incremental is incr_instance + p.extract(r) + info = p.normalize() + assert "some_data_no_incremental" not in info.row_counts + # we switch last value func to min + incr_instance = dlt.sources.incremental( + "created_at", last_value_func=min, row_order="desc", primary_key=() + ) + r.apply_hints(incremental=incr_instance) + assert r.incremental is incr_instance + p.extract(r) + info = p.normalize() + # we have three elements due to min function (equal element NOT is eliminated due to primary_key==()) + assert info.row_counts["some_data_no_incremental"] == 3 + + # remove incremental + r.apply_hints(incremental=dlt.sources.incremental.EMPTY) + assert r.incremental is None + + +def test_incremental_wrapper_on_clone_standalone_incremental() -> None: + @dlt.resource(standalone=True) + def standalone_incremental(created_at: Optional[dlt.sources.incremental[int]] = None): + yield [{"created_at": 1}, {"created_at": 2}, {"created_at": 3}] + + s_r_1 = standalone_incremental() + s_r_i_1 = dlt.sources.incremental[int]("created_at") + s_r_2 = standalone_incremental() + s_r_i_2 = dlt.sources.incremental[int]("created_at", initial_value=3) + s_r_i_3 = dlt.sources.incremental[int]("created_at", initial_value=1, last_value_func=min) + s_r_3 = standalone_incremental(created_at=s_r_i_3) + + # different wrappers + assert s_r_1.incremental is not s_r_2.incremental + s_r_1.apply_hints(incremental=s_r_i_1) + s_r_2.apply_hints(incremental=s_r_i_2) + assert s_r_1.incremental.incremental is s_r_i_1 + assert s_r_2.incremental.incremental is s_r_i_2 + + # evaluate s r 3 + assert list(s_r_3) == [{"created_at": 1}] + # incremental is set after evaluation but the instance is different (wrapper is merging instances) + assert s_r_3.incremental.incremental.last_value_func is min + + # standalone resources are bound so clone does not re-wrap + s_r_3_clone = s_r_3._clone() + assert s_r_3_clone.incremental is s_r_3.incremental + assert s_r_3_clone.incremental.incremental is s_r_3.incremental.incremental + + # evaluate others + assert len(list(s_r_1)) == 3 + assert len(list(s_r_2)) == 
1 + + +def test_incremental_wrapper_on_clone_standalone_no_incremental() -> None: + @dlt.resource(standalone=True) + def standalone(): + yield [{"created_at": 1}, {"created_at": 2}, {"created_at": 3}] + + s_r_1 = standalone() + s_r_i_1 = dlt.sources.incremental[int]("created_at", row_order="desc") + s_r_2 = standalone() + s_r_i_2 = dlt.sources.incremental[int]("created_at", initial_value=3) + + # clone keeps the incremental step + s_r_1.apply_hints(incremental=s_r_i_1) + assert s_r_1.incremental is s_r_i_1 + + s_r_1_clone = s_r_1._clone() + assert s_r_1_clone.incremental is s_r_i_1 + + assert len(list(s_r_1)) == 3 + s_r_2.apply_hints(incremental=s_r_i_2) + assert len(list(s_r_2)) == 1 + + +def test_incremental_wrapper_on_clone_incremental() -> None: + @dlt.resource + def regular_incremental(created_at: Optional[dlt.sources.incremental[int]] = None): + yield [{"created_at": 1}, {"created_at": 2}, {"created_at": 3}] + + assert regular_incremental.incremental is not None + assert regular_incremental.incremental.incremental is None + + # separate incremental + r_1 = regular_incremental() + assert r_1.args_bound is True + r_2 = regular_incremental.with_name("cloned_regular") + assert r_1.incremental is not None + assert r_2.incremental is not None + assert r_1.incremental is not r_2.incremental is not regular_incremental.incremental + + # evaluate and compare incrementals + assert len(list(r_1)) == 3 + assert len(list(r_2)) == 3 + assert r_1.incremental.incremental is None + assert r_2.incremental.incremental is None + + # now bind some real incrementals + r_3 = regular_incremental(dlt.sources.incremental[int]("created_at", initial_value=3)) + r_4 = regular_incremental( + dlt.sources.incremental[int]("created_at", initial_value=1, last_value_func=min) + ) + r_4_clone = r_4._clone("r_4_clone") + # evaluate + assert len(list(r_3)) == 1 + assert len(list(r_4)) == 1 + assert r_3.incremental.incremental is not r_4.incremental.incremental + # now the clone should share the incremental because it was done after parameters were bound + assert r_4_clone.incremental is r_4.incremental def test_last_value_func_on_dict() -> None: @@ -991,6 +1167,7 @@ def some_data( max_hours: int = 2, tz: str = None, ): + print("some_data", updated_at, dict(updated_at)) data = [ {"updated_at": start_dt + timedelta(hours=hour), "hour": hour} for hour in range(1, max_hours + 1) @@ -1034,6 +1211,8 @@ def some_data( "updated_at", initial_value=pendulum_start_dt, end_value=pendulum_start_dt.add(hours=3) ) ) + print(resource.incremental.incremental, dict(resource.incremental.incremental)) + pipeline = pipeline.drop() extract_info = pipeline.extract(resource) assert ( extract_info.metrics[extract_info.loads_ids[0]][0]["resource_metrics"][ @@ -1621,7 +1800,7 @@ def test_type( r = test_type() list(r) - assert r.incremental._incremental.get_incremental_value_type() is str + assert r.incremental.incremental.get_incremental_value_type() is str # use annotation @dlt.resource @@ -1635,7 +1814,7 @@ def test_type_2( r = test_type_2() list(r) - assert r.incremental._incremental.get_incremental_value_type() is int + assert r.incremental.incremental.get_incremental_value_type() is int # pass in explicit value @dlt.resource @@ -1645,7 +1824,7 @@ def test_type_3(updated_at: dlt.sources.incremental[int]): r = test_type_3(dlt.sources.incremental[float]("updated_at", allow_external_schedulers=True)) list(r) - assert r.incremental._incremental.get_incremental_value_type() is float + assert r.incremental.incremental.get_incremental_value_type() is 
float # pass explicit value overriding default that is typed @dlt.resource @@ -1657,7 +1836,7 @@ def test_type_4( r = test_type_4(dlt.sources.incremental[str]("updated_at", allow_external_schedulers=True)) list(r) - assert r.incremental._incremental.get_incremental_value_type() is str + assert r.incremental.incremental.get_incremental_value_type() is str # no generic type information @dlt.resource @@ -1669,7 +1848,7 @@ def test_type_5( r = test_type_5(dlt.sources.incremental("updated_at")) list(r) - assert r.incremental._incremental.get_incremental_value_type() is Any + assert r.incremental.incremental.get_incremental_value_type() is Any @pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) @@ -1743,27 +1922,39 @@ def test_type_2( # does not participate os.environ["DLT_START_VALUE"] = "2" - result = data_item_to_list(item_type, list(test_type_2())) - assert len(result) == 3 - - assert test_type_2.incremental.allow_external_schedulers is False - assert test_type_2().incremental.allow_external_schedulers is False - - # allow scheduler in wrapper - r = test_type_2() - r.incremental.allow_external_schedulers = True - result = data_item_to_list(item_type, list(test_type_2())) - assert len(result) == 2 - assert r.incremental.allow_external_schedulers is True - assert r.incremental._incremental.allow_external_schedulers is True + # r = test_type_2() + # result = data_item_to_list(item_type, list(r)) + # assert len(result) == 3 + + # # incremental not bound to the wrapper + # assert test_type_2.incremental.allow_external_schedulers is None + # assert test_type_2().incremental.allow_external_schedulers is None + # # this one is bound + # assert r.incremental.allow_external_schedulers is False + + # # allow scheduler in wrapper + # r = test_type_2() + # r.incremental.allow_external_schedulers = True + # result = data_item_to_list(item_type, list(r)) + # assert len(result) == 2 + # assert r.incremental.allow_external_schedulers is True + # assert r.incremental.incremental.allow_external_schedulers is True # add incremental dynamically @dlt.resource() def test_type_3(): - yield [{"updated_at": d} for d in [1, 2, 3]] + data = [{"updated_at": d} for d in [1, 2, 3]] + yield data_to_item_format(item_type, data) r = test_type_3() - r.add_step(dlt.sources.incremental("updated_at")) + r.add_step(dlt.sources.incremental[int]("updated_at")) r.incremental.allow_external_schedulers = True - result = data_item_to_list(item_type, list(test_type_2())) + result = data_item_to_list(item_type, list(r)) assert len(result) == 2 + + # if type of incremental cannot be inferred, external scheduler will be ignored + r = test_type_3() + r.add_step(dlt.sources.incremental("updated_at")) + r.incremental.allow_external_schedulers = True + result = data_item_to_list(item_type, list(r)) + assert len(result) == 3 diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index cd6ee4c3d5..e40b03219d 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -677,8 +677,8 @@ def test_illegal_double_bind() -> None: def _r1(): yield ["a", "b", "c"] - assert _r1._args_bound is False - assert _r1()._args_bound is True + assert _r1.args_bound is False + assert _r1().args_bound is True with pytest.raises(TypeError) as py_ex: _r1()() @@ -689,14 +689,14 @@ def _r1(): assert "Parametrized resource" in str(py_ex.value) bound_r = dlt.resource([1, 2, 3], name="rx") - assert bound_r._args_bound is True + assert bound_r.args_bound is True with pytest.raises(TypeError): _r1() def _gen(): yield 
from [1, 2, 3] - assert dlt.resource(_gen())._args_bound is True + assert dlt.resource(_gen()).args_bound is True @dlt.resource @@ -1293,7 +1293,7 @@ def empty_gen(): ) assert empty_r._hints == { "columns": {}, - "incremental": None, + "incremental": Incremental.EMPTY, "validator": None, "write_disposition": "append", "original_columns": {}, diff --git a/tests/helpers/streamlit_tests/__init__.py b/tests/helpers/streamlit_tests/__init__.py index e69de29bb2..61132f214d 100644 --- a/tests/helpers/streamlit_tests/__init__.py +++ b/tests/helpers/streamlit_tests/__init__.py @@ -0,0 +1,3 @@ +import pytest + +pytest.importorskip("streamlit") From e78a3c1aab058f4d7ef7f13ba829d92a697cd0a3 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Wed, 5 Jun 2024 19:43:24 +0200 Subject: [PATCH 10/61] Example: fast postgres to postgres (#1428) * add example * move postgres to postgres example to separate folder, change credentials setting * deleted unused functions, uncomment norm and loading * fix bugs, reformat, rename vars * delete old file, fix typing * make title shorter * fix type for load_type * ignore load_type type * add tests * fix typing * fix dateime * add deps * update lock file * merge and update lock file * install duckdb extensions * fix install duckdb extensions * fix install duckdb extensions * fix install duckdb extensions * fix version of duckdb for extensions * fix path to duckdb dump * update duckdb * update duckdb in makefile * delete manual duckdb extension installation --------- Co-authored-by: sspaeti --- .../.dlt/example.secrets.toml | 15 + .../examples/postgres_to_postgres/__init__.py | 0 .../postgres_to_postgres.py | 294 ++++++++++++++++++ poetry.lock | 182 ++++++++--- pyproject.toml | 3 + 5 files changed, 453 insertions(+), 41 deletions(-) create mode 100644 docs/examples/postgres_to_postgres/.dlt/example.secrets.toml create mode 100644 docs/examples/postgres_to_postgres/__init__.py create mode 100644 docs/examples/postgres_to_postgres/postgres_to_postgres.py diff --git a/docs/examples/postgres_to_postgres/.dlt/example.secrets.toml b/docs/examples/postgres_to_postgres/.dlt/example.secrets.toml new file mode 100644 index 0000000000..cab6abedb6 --- /dev/null +++ b/docs/examples/postgres_to_postgres/.dlt/example.secrets.toml @@ -0,0 +1,15 @@ +[destination.postgres.credentials] +host = "" +database = "" +username = "" +password = "" +port = "" +connection_timeout = 15 + +[sources.postgres.credentials] +host = "" +database = "" +username = "" +password = "" +port = "" +chunk_size = 1000000 diff --git a/docs/examples/postgres_to_postgres/__init__.py b/docs/examples/postgres_to_postgres/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/postgres_to_postgres/postgres_to_postgres.py b/docs/examples/postgres_to_postgres/postgres_to_postgres.py new file mode 100644 index 0000000000..85b8aed045 --- /dev/null +++ b/docs/examples/postgres_to_postgres/postgres_to_postgres.py @@ -0,0 +1,294 @@ +""" +--- +title: Load from Postgres to Postgres faster +description: Load data fast from Postgres to Postgres with ConnectorX & Arrow export as Parquet, normalizing and exporting as DuckDB, and attaching it to Postgres for bigger Postgres tables (GBs) +keywords: [connector x, pyarrow, zero copy, duckdb, postgres, initial load] +--- + +:::info +Huge shout out to [Simon Späti](https://github.com/sspaeti) for this example! 
+:::
+
+This example shows you how to export and import data from Postgres to Postgres in a fast way with ConnectorX and DuckDB,
+since the default export will generate `INSERT` statements during the normalization phase, which is super slow for large tables.
+
+As it's an initial load, we first create a separate schema with a timestamp and then replace the existing schema with the new one.
+
+:::note
+This approach is tested and works well for an initial load (`--replace`); however, the incremental load (`--merge`) might need some adjustments (loading dlt's load tables, setting up the first run after an initial
+load, etc.).
+:::
+
+We'll learn:
+
+- How to get Arrow tables from [ConnectorX](https://github.com/sfu-db/connector-x) and yield them in chunks.
+- That merge and incremental loads work with Arrow tables.
+- How to use DuckDB for speedy normalization.
+- How to use `argparse` to turn your pipeline script into a CLI.
+- How to work with the `ConnectionStringCredentials` spec.
+
+
+Be aware that you need to define the database credentials in `.dlt/secrets.toml` or dlt ENVs and adjust the table names ("table_1" and "table_2").
+
+Install `dlt` with the `duckdb` extra, plus `connectorx`, the Postgres adapter, and a progress bar tool:
+
+```sh
+pip install dlt[duckdb] connectorx pyarrow psycopg2-binary alive-progress
+```
+
+Run the example:
+```sh
+python postgres_to_postgres.py --replace
+```
+
+:::caution
+Attention: there were problems with the TIME data type when it includes nanoseconds. More details in
+[Slack](https://dlthub-community.slack.com/archives/C04DQA7JJN6/p1711579390028279?thread_ts=1711477727.553279&cid=C04DQA7JJN60)
+
+There were also problems with installing the DuckDB extension (see the [issue
+here](https://github.com/duckdb/duckdb/issues/8035#issuecomment-2020803032)), which is why I manually installed the `postgres_scanner.duckdb_extension` in my Dockerfile to load the data into Postgres.
+::: +""" + +import argparse +import os +from dlt.common import pendulum +from typing import List + +import connectorx as cx +import duckdb +import psycopg2 + +import dlt +from dlt.sources.credentials import ConnectionStringCredentials + +CHUNKSIZE = int( + os.getenv("CHUNKSIZE", 1000000) +) # 1 mio rows works well with 1GiB RAM memory (if no parallelism) + + +def read_sql_x_chunked(conn_str: str, query: str, chunk_size: int = CHUNKSIZE): + offset = 0 + while True: + chunk_query = f"{query} LIMIT {chunk_size} OFFSET {offset}" + data_chunk = cx.read_sql( + conn_str, + chunk_query, + return_type="arrow2", + protocol="binary", + ) + yield data_chunk + if data_chunk.num_rows < chunk_size: + break # No more data to read + offset += chunk_size + + +@dlt.source(max_table_nesting=0) +def pg_resource_chunked( + table_name: str, + primary_key: List[str], + schema_name: str, + order_date: str, + load_type: str = "merge", + columns: str = "*", + credentials: ConnectionStringCredentials = dlt.secrets[ + "sources.postgres.credentials" + ], +): + print( + f"dlt.resource write_disposition: `{load_type}` -- ", + f"connection string: postgresql://{credentials.username}:*****@{credentials.host}:{credentials.host}/{credentials.database}", + ) + + query = f"SELECT {columns} FROM {schema_name}.{table_name} ORDER BY {order_date}" # Needed to have an idempotent query + + source = dlt.resource( # type: ignore + name=table_name, + table_name=table_name, + write_disposition=load_type, # use `replace` for initial load, `merge` for incremental + primary_key=primary_key, + standalone=True, + parallelized=True, + )(read_sql_x_chunked)( + credentials.to_native_representation(), # Pass the connection string directly + query, + ) + + if load_type == "merge": + # Retrieve the last value processed for incremental loading + source.apply_hints(incremental=dlt.sources.incremental(order_date)) + + return source + + +def table_desc(table_name, pk, schema_name, order_date, columns="*"): + return { + "table_name": table_name, + "pk": pk, + "schema_name": schema_name, + "order_date": order_date, + "columns": columns, + } + + +if __name__ == "__main__": + # Input Handling + parser = argparse.ArgumentParser( + description="Run specific functions in the script." + ) + parser.add_argument("--replace", action="store_true", help="Run initial load") + parser.add_argument("--merge", action="store_true", help="Run delta load") + args = parser.parse_args() + + source_schema_name = "example_data_1" + target_schema_name = "example_data_2" + pipeline_name = "loading_postgres_to_postgres" + + tables = [ + table_desc("table_1", ["pk"], source_schema_name, "updated_at"), + table_desc("table_2", ["pk"], source_schema_name, "updated_at"), + ] + + # default is initial loading (replace) + load_type = "merge" if args.merge else "replace" + print(f"LOAD-TYPE: {load_type}") + + resources = [] + for table in tables: + resources.append( + pg_resource_chunked( + table["table_name"], + table["pk"], + table["schema_name"], + table["order_date"], + load_type=load_type, + columns=table["columns"], + ) + ) + + if load_type == "replace": + pipeline = dlt.pipeline( + pipeline_name=pipeline_name, + destination="duckdb", + dataset_name=target_schema_name, + full_refresh=True, + progress="alive_progress", + ) + else: + pipeline = dlt.pipeline( + pipeline_name=pipeline_name, + destination="postgres", + dataset_name=target_schema_name, + full_refresh=False, + ) # full_refresh=False + + # start timer + startTime = pendulum.now() + + # 1. 
extract + print("##################################### START EXTRACT ########") + pipeline.extract(resources) + print(f"--Time elapsed: {pendulum.now() - startTime}") + + # 2. normalize + print("##################################### START NORMALIZATION ########") + if load_type == "replace": + info = pipeline.normalize( + workers=2, loader_file_format="parquet" + ) # https://dlthub.com/docs/blog/dlt-arrow-loading + else: + info = pipeline.normalize() + + print(info) + print(pipeline.last_trace.last_normalize_info) + print(f"--Time elapsed: {pendulum.now() - startTime}") + + # 3. load + print("##################################### START LOAD ########") + load_info = pipeline.load() + print(load_info) + print(f"--Time elapsed: {pendulum.now() - startTime}") + + # check that stuff was loaded + row_counts = pipeline.last_trace.last_normalize_info.row_counts + assert row_counts["table_1"] == 9 + assert row_counts["table_2"] == 9 + + # make sure nothing failed + load_info.raise_on_failed_jobs() + + if load_type == "replace": + # 4. Load DuckDB local database into Postgres + print("##################################### START DUCKDB LOAD ########") + # connect to local duckdb dump + conn = duckdb.connect(f"{load_info.destination_displayable_credentials}".split(":///")[1]) + conn.sql("INSTALL postgres;") + conn.sql("LOAD postgres;") + # select generated timestamp schema + timestamped_schema = conn.sql( + f"""select distinct table_schema from information_schema.tables + where table_schema like '{target_schema_name}%' + and table_schema NOT LIKE '%_staging' + order by table_schema desc""" + ).fetchone()[0] + print(f"timestamped_schema: {timestamped_schema}") + + target_credentials = ConnectionStringCredentials(dlt.secrets["destination.postgres.credentials"]) + # connect to destination (timestamped schema) + conn.sql( + f"ATTACH 'dbname={target_credentials.database} user={target_credentials.username} password={target_credentials.password} host={target_credentials.host} port={target_credentials.port}' AS pg_db (TYPE postgres);" + ) + conn.sql(f"CREATE SCHEMA IF NOT EXISTS pg_db.{timestamped_schema};") + + for table in tables: + print( + f"LOAD DuckDB -> Postgres: table: {timestamped_schema}.{table['table_name']} TO Postgres {timestamped_schema}.{table['table_name']}" + ) + + conn.sql( + f"CREATE OR REPLACE TABLE pg_db.{timestamped_schema}.{table['table_name']} AS SELECT * FROM {timestamped_schema}.{table['table_name']};" + ) + conn.sql( + f"SELECT count(*) as count FROM pg_db.{timestamped_schema}.{table['table_name']};" + ).show() + + print(f"--Time elapsed: {pendulum.now() - startTime}") + print("##################################### FINISHED ########") + + # check that stuff was loaded + rows = conn.sql( + f"SELECT count(*) as count FROM pg_db.{timestamped_schema}.{table['table_name']};" + ).fetchone()[0] + assert int(rows) == 9 + + # 5. Cleanup and rename Schema + print( + "##################################### RENAME Schema and CLEANUP ########" + ) + try: + con_hd = psycopg2.connect( + dbname=target_credentials.database, + user=target_credentials.username, + password=target_credentials.password, + host=target_credentials.host, + port=target_credentials.port, + ) + con_hd.autocommit = True + print( + "Connected to HD-DB: " + + target_credentials.host + + ", DB: " + + target_credentials.username + ) + except Exception as e: + print(f"Unable to connect to HD-database! 
The reason: {e}") + + with con_hd.cursor() as cur: + # Drop existing target_schema_name + print(f"Drop existing {target_schema_name}") + cur.execute(f"DROP SCHEMA IF EXISTS {target_schema_name} CASCADE;") + # Rename timestamped-target_schema_name to target_schema_name + print(f"Going to rename schema {timestamped_schema} to {target_schema_name}") + cur.execute(f"ALTER SCHEMA {timestamped_schema} RENAME TO {target_schema_name};") + + con_hd.close() diff --git a/poetry.lock b/poetry.lock index 6159f751c4..b476bc4a9f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "about-time" @@ -3680,6 +3680,106 @@ files = [ {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"}, {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = "sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"}, {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:222fc2ee0e40522de0b21ad3bc90ab8983be3bf3cec3d349c80d76c8bb1a4beb"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d4763b0b9195b72132a4e7de8e5a9bf1f05542f442a9115aa27cfc2a8004f581"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:209649da10c9d4a93d8a4d100ecbf9cc3b0252169426bec3e8b4ad7e57d600cf"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:68813aa333c1604a2df4a495b2a6ed065d7c8aebf26cc7e7abb5a6835d08353c"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:370a23ec775ad14e9d1e71474d56f381224dcf3e72b15d8ca7b4ad7dd9cd5853"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:14664a66a3ddf6bc9e56f401bf029db2d169982c53eff3f5876399104df0e9a6"}, + {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ea3722cc4932cbcebd553b69dce1b4a73572823cff4e6a244f1c855da21d511"}, + {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e14bb264c40fd7c627ef5678e295370cd6ba95ca71d835798b6e37502fc4c690"}, + {file = "google_re2-1.1-5-cp310-cp310-win32.whl", hash = "sha256:39512cd0151ea4b3969c992579c79b423018b464624ae955be685fc07d94556c"}, + {file = "google_re2-1.1-5-cp310-cp310-win_amd64.whl", hash = "sha256:ac66537aa3bc5504320d922b73156909e3c2b6da19739c866502f7827b3f9fdf"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5b5ea68d54890c9edb1b930dcb2658819354e5d3f2201f811798bbc0a142c2b4"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:33443511b6b83c35242370908efe2e8e1e7cae749c766b2b247bf30e8616066c"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:413d77bdd5ba0bfcada428b4c146e87707452ec50a4091ec8e8ba1413d7e0619"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:5171686e43304996a34baa2abcee6f28b169806d0e583c16d55e5656b092a414"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3b284db130283771558e31a02d8eb8fb756156ab98ce80035ae2e9e3a5f307c4"}, + {file = 
"google_re2-1.1-5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:296e6aed0b169648dc4b870ff47bd34c702a32600adb9926154569ef51033f47"}, + {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:38d50e68ead374160b1e656bbb5d101f0b95fb4cc57f4a5c12100155001480c5"}, + {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2a0416a35921e5041758948bcb882456916f22845f66a93bc25070ef7262b72a"}, + {file = "google_re2-1.1-5-cp311-cp311-win32.whl", hash = "sha256:a1d59568bbb5de5dd56dd6cdc79907db26cce63eb4429260300c65f43469e3e7"}, + {file = "google_re2-1.1-5-cp311-cp311-win_amd64.whl", hash = "sha256:72f5a2f179648b8358737b2b493549370debd7d389884a54d331619b285514e3"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:cbc72c45937b1dc5acac3560eb1720007dccca7c9879138ff874c7f6baf96005"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5fadd1417fbef7235fa9453dba4eb102e6e7d94b1e4c99d5fa3dd4e288d0d2ae"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:040f85c63cc02696485b59b187a5ef044abe2f99b92b4fb399de40b7d2904ccc"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:64e3b975ee6d9bbb2420494e41f929c1a0de4bcc16d86619ab7a87f6ea80d6bd"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8ee370413e00f4d828eaed0e83b8af84d7a72e8ee4f4bd5d3078bc741dfc430a"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:5b89383001079323f693ba592d7aad789d7a02e75adb5d3368d92b300f5963fd"}, + {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:63cb4fdfbbda16ae31b41a6388ea621510db82feb8217a74bf36552ecfcd50ad"}, + {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ebedd84ae8be10b7a71a16162376fd67a2386fe6361ef88c622dcf7fd679daf"}, + {file = "google_re2-1.1-5-cp312-cp312-win32.whl", hash = "sha256:c8e22d1692bc2c81173330c721aff53e47ffd3c4403ff0cd9d91adfd255dd150"}, + {file = "google_re2-1.1-5-cp312-cp312-win_amd64.whl", hash = "sha256:5197a6af438bb8c4abda0bbe9c4fbd6c27c159855b211098b29d51b73e4cbcf6"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b6727e0b98417e114b92688ad2aa256102ece51f29b743db3d831df53faf1ce3"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:711e2b6417eb579c61a4951029d844f6b95b9b373b213232efd413659889a363"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:71ae8b3df22c5c154c8af0f0e99d234a450ef1644393bc2d7f53fc8c0a1e111c"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:94a04e214bc521a3807c217d50cf099bbdd0c0a80d2d996c0741dbb995b5f49f"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:a770f75358508a9110c81a1257721f70c15d9bb592a2fb5c25ecbd13566e52a5"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:07c9133357f7e0b17c6694d5dcb82e0371f695d7c25faef2ff8117ef375343ff"}, + {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:204ca6b1cf2021548f4a9c29ac015e0a4ab0a7b6582bf2183d838132b60c8fda"}, + {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f0b95857c2c654f419ca684ec38c9c3325c24e6ba7d11910a5110775a557bb18"}, + {file = "google_re2-1.1-5-cp38-cp38-win32.whl", hash = 
"sha256:347ac770e091a0364e822220f8d26ab53e6fdcdeaec635052000845c5a3fb869"}, + {file = "google_re2-1.1-5-cp38-cp38-win_amd64.whl", hash = "sha256:ec32bb6de7ffb112a07d210cf9f797b7600645c2d5910703fa07f456dd2150e0"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb5adf89060f81c5ff26c28e261e6b4997530a923a6093c9726b8dec02a9a326"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a22630c9dd9ceb41ca4316bccba2643a8b1d5c198f21c00ed5b50a94313aaf10"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:544dc17fcc2d43ec05f317366375796351dec44058e1164e03c3f7d050284d58"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:19710af5ea88751c7768575b23765ce0dfef7324d2539de576f75cdc319d6654"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:f82995a205e08ad896f4bd5ce4847c834fab877e1772a44e5f262a647d8a1dec"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:63533c4d58da9dc4bc040250f1f52b089911699f0368e0e6e15f996387a984ed"}, + {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79e00fcf0cb04ea35a22b9014712d448725ce4ddc9f08cc818322566176ca4b0"}, + {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bc41afcefee2da6c4ed883a93d7f527c4b960cd1d26bbb0020a7b8c2d341a60a"}, + {file = "google_re2-1.1-5-cp39-cp39-win32.whl", hash = "sha256:486730b5e1f1c31b0abc6d80abe174ce4f1188fe17d1b50698f2bf79dc6e44be"}, + {file = "google_re2-1.1-5-cp39-cp39-win_amd64.whl", hash = "sha256:4de637ca328f1d23209e80967d1b987d6b352cd01b3a52a84b4d742c69c3da6c"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:621e9c199d1ff0fdb2a068ad450111a84b3bf14f96dfe5a8a7a0deae5f3f4cce"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:220acd31e7dde95373f97c3d1f3b3bd2532b38936af28b1917ee265d25bebbf4"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:db34e1098d164f76251a6ece30e8f0ddfd65bb658619f48613ce71acb3f9cbdb"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:5152bac41d8073977582f06257219541d0fc46ad99b0bbf30e8f60198a43b08c"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:6191294799e373ee1735af91f55abd23b786bdfd270768a690d9d55af9ea1b0d"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:070cbafbb4fecbb02e98feb28a1eb292fb880f434d531f38cc33ee314b521f1f"}, + {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8437d078b405a59a576cbed544490fe041140f64411f2d91012e8ec05ab8bf86"}, + {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f00f9a9af8896040e37896d9b9fc409ad4979f1ddd85bb188694a7d95ddd1164"}, + {file = "google_re2-1.1-6-cp310-cp310-win32.whl", hash = "sha256:df26345f229a898b4fd3cafd5f82259869388cee6268fc35af16a8e2293dd4e5"}, + {file = "google_re2-1.1-6-cp310-cp310-win_amd64.whl", hash = "sha256:3665d08262c57c9b28a5bdeb88632ad792c4e5f417e5645901695ab2624f5059"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b26b869d8aa1d8fe67c42836bf3416bb72f444528ee2431cfb59c0d3e02c6ce3"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:41fd4486c57dea4f222a6bb7f1ff79accf76676a73bdb8da0fcbd5ba73f8da71"}, + {file = 
"google_re2-1.1-6-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:0ee378e2e74e25960070c338c28192377c4dd41e7f4608f2688064bd2badc41e"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:a00cdbf662693367b36d075b29feb649fd7ee1b617cf84f85f2deebeda25fc64"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c09455014217a41499432b8c8f792f25f3df0ea2982203c3a8c8ca0e7895e69"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6501717909185327935c7945e23bb5aa8fc7b6f237b45fe3647fa36148662158"}, + {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3510b04790355f199e7861c29234081900e1e1cbf2d1484da48aa0ba6d7356ab"}, + {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8c0e64c187ca406764f9e9ad6e750d62e69ed8f75bf2e865d0bfbc03b642361c"}, + {file = "google_re2-1.1-6-cp311-cp311-win32.whl", hash = "sha256:2a199132350542b0de0f31acbb3ca87c3a90895d1d6e5235f7792bb0af02e523"}, + {file = "google_re2-1.1-6-cp311-cp311-win_amd64.whl", hash = "sha256:83bdac8ceaece8a6db082ea3a8ba6a99a2a1ee7e9f01a9d6d50f79c6f251a01d"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:81985ff894cd45ab5a73025922ac28c0707759db8171dd2f2cc7a0e856b6b5ad"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5635af26065e6b45456ccbea08674ae2ab62494008d9202df628df3b267bc095"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:813b6f04de79f4a8fdfe05e2cb33e0ccb40fe75d30ba441d519168f9d958bd54"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:5ec2f5332ad4fd232c3f2d6748c2c7845ccb66156a87df73abcc07f895d62ead"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5a687b3b32a6cbb731647393b7c4e3fde244aa557f647df124ff83fb9b93e170"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:39a62f9b3db5d3021a09a47f5b91708b64a0580193e5352751eb0c689e4ad3d7"}, + {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca0f0b45d4a1709cbf5d21f355e5809ac238f1ee594625a1e5ffa9ff7a09eb2b"}, + {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a64b3796a7a616c7861247bd061c9a836b5caf0d5963e5ea8022125601cf7b09"}, + {file = "google_re2-1.1-6-cp312-cp312-win32.whl", hash = "sha256:32783b9cb88469ba4cd9472d459fe4865280a6b1acdad4480a7b5081144c4eb7"}, + {file = "google_re2-1.1-6-cp312-cp312-win_amd64.whl", hash = "sha256:259ff3fd2d39035b9cbcbf375995f83fa5d9e6a0c5b94406ff1cc168ed41d6c6"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e4711bcffe190acd29104d8ecfea0c0e42b754837de3fb8aad96e6cc3c613cdc"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:4d081cce43f39c2e813fe5990e1e378cbdb579d3f66ded5bade96130269ffd75"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:4f123b54d48450d2d6b14d8fad38e930fb65b5b84f1b022c10f2913bd956f5b5"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:e1928b304a2b591a28eb3175f9db7f17c40c12cf2d4ec2a85fdf1cc9c073ff91"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:3a69f76146166aec1173003c1f547931bdf288c6b135fda0020468492ac4149f"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_x86_64.whl", hash = 
"sha256:fc08c388f4ebbbca345e84a0c56362180d33d11cbe9ccfae663e4db88e13751e"}, + {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b057adf38ce4e616486922f2f47fc7d19c827ba0a7f69d540a3664eba2269325"}, + {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4138c0b933ab099e96f5d8defce4486f7dfd480ecaf7f221f2409f28022ccbc5"}, + {file = "google_re2-1.1-6-cp38-cp38-win32.whl", hash = "sha256:9693e45b37b504634b1abbf1ee979471ac6a70a0035954592af616306ab05dd6"}, + {file = "google_re2-1.1-6-cp38-cp38-win_amd64.whl", hash = "sha256:5674d437baba0ea287a5a7f8f81f24265d6ae8f8c09384e2ef7b6f84b40a7826"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:7783137cb2e04f458a530c6d0ee9ef114815c1d48b9102f023998c371a3b060e"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a49b7153935e7a303675f4deb5f5d02ab1305adefc436071348706d147c889e0"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a96a8bb309182090704593c60bdb369a2756b38fe358bbf0d40ddeb99c71769f"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:dff3d4be9f27ef8ec3705eed54f19ef4ab096f5876c15fe011628c69ba3b561c"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:40f818b0b39e26811fa677978112a8108269977fdab2ba0453ac4363c35d9e66"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:8a7e53538cdb40ef4296017acfbb05cab0c19998be7552db1cfb85ba40b171b9"}, + {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6ee18e7569fb714e5bb8c42809bf8160738637a5e71ed5a4797757a1fb4dc4de"}, + {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cda4f6d1a7d5b43ea92bc395f23853fba0caf8b1e1efa6e8c48685f912fcb89"}, + {file = "google_re2-1.1-6-cp39-cp39-win32.whl", hash = "sha256:6a9cdbdc36a2bf24f897be6a6c85125876dc26fea9eb4247234aec0decbdccfd"}, + {file = "google_re2-1.1-6-cp39-cp39-win_amd64.whl", hash = "sha256:73f646cecfad7cc5b4330b4192c25f2e29730a3b8408e089ffd2078094208196"}, ] [[package]] @@ -6320,7 +6420,7 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] name = "psycopg2-binary" version = "2.9.7" description = "psycopg2 - Python-PostgreSQL Database Adapter" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "psycopg2-binary-2.9.7.tar.gz", hash = "sha256:1b918f64a51ffe19cd2e230b3240ba481330ce1d4b7875ae67305bd1d37b041c"}, @@ -6412,47 +6512,47 @@ files = [ [[package]] name = "pyarrow" -version = "14.0.1" +version = "16.1.0" description = "Python library for Apache Arrow" -optional = true +optional = false python-versions = ">=3.8" files = [ - {file = "pyarrow-14.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:96d64e5ba7dceb519a955e5eeb5c9adcfd63f73a56aea4722e2cc81364fc567a"}, - {file = "pyarrow-14.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a8ae88c0038d1bc362a682320112ee6774f006134cd5afc291591ee4bc06505"}, - {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f6f053cb66dc24091f5511e5920e45c83107f954a21032feadc7b9e3a8e7851"}, - {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:906b0dc25f2be12e95975722f1e60e162437023f490dbd80d0deb7375baf3171"}, - {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = 
"sha256:78d4a77a46a7de9388b653af1c4ce539350726cd9af62e0831e4f2bd0c95a2f4"}, - {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:06ca79080ef89d6529bb8e5074d4b4f6086143b2520494fcb7cf8a99079cde93"}, - {file = "pyarrow-14.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:32542164d905002c42dff896efdac79b3bdd7291b1b74aa292fac8450d0e4dcd"}, - {file = "pyarrow-14.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:c7331b4ed3401b7ee56f22c980608cf273f0380f77d0f73dd3c185f78f5a6220"}, - {file = "pyarrow-14.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:922e8b49b88da8633d6cac0e1b5a690311b6758d6f5d7c2be71acb0f1e14cd61"}, - {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58c889851ca33f992ea916b48b8540735055201b177cb0dcf0596a495a667b00"}, - {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30d8494870d9916bb53b2a4384948491444741cb9a38253c590e21f836b01222"}, - {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:be28e1a07f20391bb0b15ea03dcac3aade29fc773c5eb4bee2838e9b2cdde0cb"}, - {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:981670b4ce0110d8dcb3246410a4aabf5714db5d8ea63b15686bce1c914b1f83"}, - {file = "pyarrow-14.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:4756a2b373a28f6166c42711240643fb8bd6322467e9aacabd26b488fa41ec23"}, - {file = "pyarrow-14.0.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:cf87e2cec65dd5cf1aa4aba918d523ef56ef95597b545bbaad01e6433851aa10"}, - {file = "pyarrow-14.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:470ae0194fbfdfbf4a6b65b4f9e0f6e1fa0ea5b90c1ee6b65b38aecee53508c8"}, - {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6263cffd0c3721c1e348062997babdf0151301f7353010c9c9a8ed47448f82ab"}, - {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8089d7e77d1455d529dbd7cff08898bbb2666ee48bc4085203af1d826a33cc"}, - {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fada8396bc739d958d0b81d291cfd201126ed5e7913cb73de6bc606befc30226"}, - {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:2a145dab9ed7849fc1101bf03bcdc69913547f10513fdf70fc3ab6c0a50c7eee"}, - {file = "pyarrow-14.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:05fe7994745b634c5fb16ce5717e39a1ac1fac3e2b0795232841660aa76647cd"}, - {file = "pyarrow-14.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:a8eeef015ae69d104c4c3117a6011e7e3ecd1abec79dc87fd2fac6e442f666ee"}, - {file = "pyarrow-14.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3c76807540989fe8fcd02285dd15e4f2a3da0b09d27781abec3adc265ddbeba1"}, - {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:450e4605e3c20e558485f9161a79280a61c55efe585d51513c014de9ae8d393f"}, - {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:323cbe60210173ffd7db78bfd50b80bdd792c4c9daca8843ef3cd70b186649db"}, - {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0140c7e2b740e08c5a459439d87acd26b747fc408bde0a8806096ee0baaa0c15"}, - {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:e592e482edd9f1ab32f18cd6a716c45b2c0f2403dc2af782f4e9674952e6dd27"}, - {file = "pyarrow-14.0.1-cp38-cp38-win_amd64.whl", hash = 
"sha256:d264ad13605b61959f2ae7c1d25b1a5b8505b112715c961418c8396433f213ad"}, - {file = "pyarrow-14.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:01e44de9749cddc486169cb632f3c99962318e9dacac7778315a110f4bf8a450"}, - {file = "pyarrow-14.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d0351fecf0e26e152542bc164c22ea2a8e8c682726fce160ce4d459ea802d69c"}, - {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c1f6110c386464fd2e5e4ea3624466055bbe681ff185fd6c9daa98f30a3f9a"}, - {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11e045dfa09855b6d3e7705a37c42e2dc2c71d608fab34d3c23df2e02df9aec3"}, - {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:097828b55321897db0e1dbfc606e3ff8101ae5725673498cbfa7754ee0da80e4"}, - {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1daab52050a1c48506c029e6fa0944a7b2436334d7e44221c16f6f1b2cc9c510"}, - {file = "pyarrow-14.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:3f6d5faf4f1b0d5a7f97be987cf9e9f8cd39902611e818fe134588ee99bf0283"}, - {file = "pyarrow-14.0.1.tar.gz", hash = "sha256:b8b3f4fe8d4ec15e1ef9b599b94683c5216adaed78d5cb4c606180546d1e2ee1"}, + {file = "pyarrow-16.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9"}, + {file = "pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c"}, + {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd"}, + {file = "pyarrow-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff"}, + {file = "pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c"}, + {file = "pyarrow-16.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e"}, + {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b"}, + {file = "pyarrow-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b"}, + {file = "pyarrow-16.1.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f"}, + {file = 
"pyarrow-16.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628"}, + {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7"}, + {file = "pyarrow-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444"}, + {file = "pyarrow-16.1.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f"}, + {file = "pyarrow-16.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15"}, + {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed"}, + {file = "pyarrow-16.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710"}, + {file = "pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55"}, + {file = "pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa"}, + {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3"}, + {file = "pyarrow-16.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a"}, + {file = "pyarrow-16.1.0.tar.gz", hash = "sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315"}, ] [package.dependencies] @@ -9287,4 +9387,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "605b9b04ed3ae8b71c41eaf532d7bc8ce4f8135ef00593b5f01a82debc3e14c8" +content-hash = "4584e2332c46a3a409ee605a1f03d110b765a491024ee96e44f62902c0769711" diff --git a/pyproject.toml b/pyproject.toml index cc18c37353..8e98445f02 100644 --- a/pyproject.toml 
+++ b/pyproject.toml @@ -205,6 +205,9 @@ dbt-core = ">=1.2.0" dbt-duckdb = ">=1.2.0" pymongo = ">=4.3.3" pandas = ">2" +alive-progress = ">=3.0.1" +pyarrow = ">=16.0.0" +psycopg2-binary = ">=2.9" [tool.black] # https://black.readthedocs.io/en/stable/usage_and_configuration/the_basics.html#configuration-via-a-file line-length = 100 From 1c1ce7e04fa6fb347684abfc4741911ae1d647bf Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Thu, 6 Jun 2024 12:27:22 +0400 Subject: [PATCH 11/61] Add Delta table support for `filesystem` destination (#1382) * add delta table support for filesystem destination * Merge branch 'refs/heads/devel' into 978-filesystem-delta-table * remove duplicate method definition * make property robust * exclude high-precision decimal columns * make delta imports conditional * include pyarrow in deltalake dependency * install extra deltalake dependency * disable high precision decimal arrow test columns by default * include arrow max precision decimal column * introduce directory job and refactor delta table code * refactor delta table load * revert import changes * add delta table format child table handling * make table_format key lookups robust * write remote path to reference file * add supported table formats and file format adapter to destination capabilities * remove jsonl and parquet from table formats * add object_store rust crate credentials handling * add deltalake_storage_options to filesystem config * move function to top level to prevent multiprocessing pickle error * add new deltalake_storage_options filesystem config key to tests * replace secrets with dummy values in test * reorganize object_store rust crate credentials tests * add delta table format docs * move delta table logical delete logic to filesystem client * rename pyarrow lib method names * rename utils to delta_utils * import pyarrow from dlt common libs * move delta lake utitilities to module in dlt common libs * import delta lake utils early to assert dependencies availability * handle file format adaptation at table level * initialize file format variables * split delta table format tests * handle table schema is None case * add test for dynamic dispatching of delta tables * mark core delta table test as essential * simplify item normalizer dict key * make list copy to prevent in place mutations * add extra deltalake dependency * only test deltalake lib on local filesystem * properly evaluates lazy annotations * uses base FilesystemConfiguration from common in libs * solves union type reordering due to caching and clash with delta-rs DeltaTable method signature * creates a table with just root name to cache item normalizers properly --------- Co-authored-by: Jorrit Sandbrink Co-authored-by: Marcin Rudolf --- .github/workflows/test_common.yml | 2 +- .github/workflows/test_destinations.yml | 2 +- .github/workflows/test_local_destinations.yml | 2 +- .../configuration/specs/aws_credentials.py | 8 + .../configuration/specs/azure_credentials.py | 11 + .../configuration/specs/base_configuration.py | 11 +- .../configuration/specs/gcp_credentials.py | 13 +- dlt/common/destination/capabilities.py | 34 ++- dlt/common/destination/reference.py | 5 +- dlt/common/libs/deltalake.py | 90 +++++++ dlt/common/libs/pyarrow.py | 26 ++ dlt/common/libs/pydantic.py | 3 + dlt/common/schema/typing.py | 2 +- dlt/common/storages/configuration.py | 1 + dlt/common/storages/load_package.py | 2 +- dlt/destinations/impl/athena/__init__.py | 1 + 
dlt/destinations/impl/athena/athena.py | 10 +- dlt/destinations/impl/dummy/dummy.py | 5 +- dlt/destinations/impl/filesystem/__init__.py | 23 +- .../impl/filesystem/filesystem.py | 155 ++++++++--- dlt/destinations/job_client_impl.py | 7 +- dlt/destinations/job_impl.py | 2 +- dlt/extract/hints.py | 4 + dlt/load/load.py | 6 +- dlt/normalize/items_normalizers.py | 5 +- dlt/normalize/normalize.py | 74 ++++-- dlt/pipeline/pipeline.py | 4 +- .../dlt-ecosystem/destinations/filesystem.md | 34 +++ .../docs/dlt-ecosystem/table-formats/delta.md | 13 + .../dlt-ecosystem/table-formats/iceberg.md | 13 + poetry.lock | 140 +++------- pyproject.toml | 2 + tests/cases.py | 23 +- .../configuration/test_annotation_future.py | 25 ++ tests/libs/test_deltalake.py | 182 +++++++++++++ tests/libs/test_pydantic.py | 4 +- .../load/filesystem/test_filesystem_client.py | 1 - .../load/filesystem/test_filesystem_common.py | 3 + .../test_object_store_rs_credentials.py | 148 +++++++++++ .../load/pipeline/test_filesystem_pipeline.py | 250 +++++++++++++++++- tests/pipeline/utils.py | 64 ++++- 41 files changed, 1203 insertions(+), 207 deletions(-) create mode 100644 dlt/common/libs/deltalake.py create mode 100644 docs/website/docs/dlt-ecosystem/table-formats/delta.md create mode 100644 docs/website/docs/dlt-ecosystem/table-formats/iceberg.md create mode 100644 tests/common/configuration/test_annotation_future.py create mode 100644 tests/libs/test_deltalake.py create mode 100644 tests/load/filesystem/test_object_store_rs_credentials.py diff --git a/.github/workflows/test_common.yml b/.github/workflows/test_common.yml index 4ab40d11f6..6b79060f07 100644 --- a/.github/workflows/test_common.yml +++ b/.github/workflows/test_common.yml @@ -123,7 +123,7 @@ jobs: shell: cmd - name: Install pipeline dependencies - run: poetry install --no-interaction -E duckdb -E cli -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E duckdb -E cli -E parquet --with sentry-sdk --with pipeline -E deltalake - run: | poetry run pytest tests/extract tests/pipeline tests/libs tests/cli/common tests/destinations diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index 037f9da3e5..e75cd6c780 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -75,7 +75,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli --with sentry-sdk --with pipeline -E deltalake - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index dfe8e56735..263d3f588c 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -90,7 +90,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate --with sentry-sdk --with pipeline -E deltalake - name: 
create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/dlt/common/configuration/specs/aws_credentials.py b/dlt/common/configuration/specs/aws_credentials.py index ee49e79e40..97803a60e3 100644 --- a/dlt/common/configuration/specs/aws_credentials.py +++ b/dlt/common/configuration/specs/aws_credentials.py @@ -45,6 +45,14 @@ def to_session_credentials(self) -> Dict[str, str]: aws_session_token=self.aws_session_token, ) + def to_object_store_rs_credentials(self) -> Dict[str, str]: + # https://docs.rs/object_store/latest/object_store/aws + assert self.region_name is not None, "`object_store` Rust crate requires AWS region." + creds = self.to_session_credentials() + if creds["aws_session_token"] is None: + creds.pop("aws_session_token") + return {**creds, **{"region": self.region_name}} + @configspec class AwsCredentials(AwsCredentialsWithoutDefaults, CredentialsWithDefault): diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py index 8b8fc259f2..7fa34fa00f 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -27,6 +27,13 @@ def to_adlfs_credentials(self) -> Dict[str, Any]: sas_token=self.azure_storage_sas_token, ) + def to_object_store_rs_credentials(self) -> Dict[str, str]: + # https://docs.rs/object_store/latest/object_store/azure + creds = self.to_adlfs_credentials() + if creds["sas_token"] is None: + creds.pop("sas_token") + return creds + def create_sas_token(self) -> None: from azure.storage.blob import generate_account_sas, ResourceTypes @@ -61,6 +68,10 @@ def to_adlfs_credentials(self) -> Dict[str, Any]: client_secret=self.azure_client_secret, ) + def to_object_store_rs_credentials(self) -> Dict[str, str]: + # https://docs.rs/object_store/latest/object_store/azure + return self.to_adlfs_credentials() + @configspec class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault): diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 9ef756a2a6..1751b6ae13 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -1,5 +1,4 @@ import copy -import inspect import contextlib import dataclasses import warnings @@ -221,6 +220,11 @@ def wrap(cls: Type[TAnyClass]) -> Type[TAnyClass]: if att_name not in cls.__annotations__: raise ConfigFieldMissingTypeHintException(att_name, cls) hint = cls.__annotations__[att_name] + # resolve the annotation as per PEP 563 + # NOTE: we do not use get_type_hints because at this moment cls is an unknown name + # (ie. used as decorator and module is being imported) + if isinstance(hint, str): + hint = eval(hint) # context can have any type if not is_valid_hint(hint) and not is_context: @@ -321,7 +325,10 @@ def _get_resolvable_dataclass_fields(cls) -> Iterator[TDtcField]: @classmethod def get_resolvable_fields(cls) -> Dict[str, type]: """Returns a mapping of fields to their type hints. 
Dunders should not be resolved and are not returned""" - return {f.name: f.type for f in cls._get_resolvable_dataclass_fields()} + return { + f.name: eval(f.type) if isinstance(f.type, str) else f.type # type: ignore[arg-type] + for f in cls._get_resolvable_dataclass_fields() + } def is_resolved(self) -> bool: return self.__is_resolved__ diff --git a/dlt/common/configuration/specs/gcp_credentials.py b/dlt/common/configuration/specs/gcp_credentials.py index 9927b81ebf..a1d82fc577 100644 --- a/dlt/common/configuration/specs/gcp_credentials.py +++ b/dlt/common/configuration/specs/gcp_credentials.py @@ -1,6 +1,6 @@ import dataclasses import sys -from typing import Any, ClassVar, Final, List, Tuple, Union, Dict +from typing import Any, ClassVar, Final, List, Tuple, Union, Dict, Optional from dlt.common.json import json from dlt.common.pendulum import pendulum @@ -74,6 +74,7 @@ def to_gcs_credentials(self) -> Dict[str, Any]: @configspec class GcpServiceAccountCredentialsWithoutDefaults(GcpCredentials): private_key: TSecretValue = None + private_key_id: Optional[str] = None client_email: str = None type: Final[str] = dataclasses.field( # noqa: A003 default="service_account", init=False, repr=False, compare=False @@ -122,6 +123,10 @@ def to_native_credentials(self) -> Any: else: return ServiceAccountCredentials.from_service_account_info(self) + def to_object_store_rs_credentials(self) -> Dict[str, str]: + # https://docs.rs/object_store/latest/object_store/gcp + return {"service_account_key": json.dumps(dict(self))} + def __str__(self) -> str: return f"{self.client_email}@{self.project_id}" @@ -171,6 +176,12 @@ def parse_native_representation(self, native_value: Any) -> None: def to_native_representation(self) -> str: return json.dumps(self._info_dict()) + def to_object_store_rs_credentials(self) -> Dict[str, str]: + raise NotImplementedError( + "`object_store` Rust crate does not support OAuth for GCP credentials. Reference:" + " https://docs.rs/object_store/latest/object_store/gcp." + ) + def auth(self, scopes: Union[str, List[str]] = None, redirect_url: str = None) -> None: if not self.refresh_token: self.add_scopes(scopes) diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index 089b4a1d5e..e5ceb859f1 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -1,4 +1,15 @@ -from typing import Any, Callable, ClassVar, List, Literal, Optional, Sequence, Tuple, Set, get_args +from typing import ( + Any, + Callable, + ClassVar, + Literal, + Optional, + Sequence, + Tuple, + Set, + Protocol, + get_args, +) from dlt.common.configuration.utils import serialize_value from dlt.common.configuration import configspec @@ -9,7 +20,6 @@ DestinationLoadingWithoutStagingNotSupported, ) from dlt.common.utils import identity -from dlt.common.pendulum import pendulum from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE from dlt.common.wei import EVM_DECIMAL_PRECISION @@ -23,12 +33,28 @@ ALL_SUPPORTED_FILE_FORMATS: Set[TLoaderFileFormat] = set(get_args(TLoaderFileFormat)) +class LoaderFileFormatAdapter(Protocol): + """Callback protocol for `loader_file_format_adapter` capability.""" + + def __call__( + self, + preferred_loader_file_format: TLoaderFileFormat, + supported_loader_file_formats: Sequence[TLoaderFileFormat], + /, + *, + table_schema: "TTableSchema", # type: ignore[name-defined] # noqa: F821 + ) -> Tuple[TLoaderFileFormat, Sequence[TLoaderFileFormat]]: ... 
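# For illustration only: a minimal sketch of a callback satisfying the `LoaderFileFormatAdapter`
# protocol above. The function name `force_parquet_for_delta` is hypothetical; the `filesystem`
# destination registers a very similar adapter further down in this patch, and `normalize` calls
# the adapter per table to pick the effective loader file format.
from typing import Sequence, Tuple

from dlt.common.destination import TLoaderFileFormat
from dlt.common.schema.typing import TTableSchema


def force_parquet_for_delta(
    preferred_loader_file_format: TLoaderFileFormat,
    supported_loader_file_formats: Sequence[TLoaderFileFormat],
    /,
    *,
    table_schema: TTableSchema,
) -> Tuple[TLoaderFileFormat, Sequence[TLoaderFileFormat]]:
    # tables stored in the `delta` table format can only be written as parquet
    if table_schema.get("table_format") == "delta":
        return "parquet", ["parquet"]
    # all other tables keep the destination defaults
    return preferred_loader_file_format, supported_loader_file_formats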
+ + @configspec class DestinationCapabilitiesContext(ContainerInjectableContext): """Injectable destination capabilities required for many Pipeline stages ie. normalize""" preferred_loader_file_format: TLoaderFileFormat = None supported_loader_file_formats: Sequence[TLoaderFileFormat] = None + loader_file_format_adapter: LoaderFileFormatAdapter = None + """Callable that adapts `preferred_loader_file_format` and `supported_loader_file_formats` at runtime.""" + supported_table_formats: Sequence["TTableFormat"] = None # type: ignore[name-defined] # noqa: F821 recommended_file_size: Optional[int] = None """Recommended file size in bytes when writing extract/load files""" preferred_staging_file_format: Optional[TLoaderFileFormat] = None @@ -65,14 +91,18 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): @staticmethod def generic_capabilities( preferred_loader_file_format: TLoaderFileFormat = None, + loader_file_format_adapter: LoaderFileFormatAdapter = None, + supported_table_formats: Sequence["TTableFormat"] = None, # type: ignore[name-defined] # noqa: F821 ) -> "DestinationCapabilitiesContext": from dlt.common.data_writers.escape import format_datetime_literal caps = DestinationCapabilitiesContext() caps.preferred_loader_file_format = preferred_loader_file_format caps.supported_loader_file_formats = ["jsonl", "insert_values", "parquet", "csv"] + caps.loader_file_format_adapter = loader_file_format_adapter caps.preferred_staging_file_format = None caps.supported_staging_file_formats = [] + caps.supported_table_formats = supported_table_formats or [] caps.escape_identifier = identity caps.escape_literal = serialize_value caps.format_datetime_literal = format_datetime_literal diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 4919711f58..9bb843a4c5 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -48,6 +48,7 @@ from dlt.common.schema.exceptions import UnknownTableException from dlt.common.storages import FileStorage from dlt.common.storages.load_storage import ParsedLoadJobFileName +from dlt.common.storages.load_package import LoadJobInfo TLoaderReplaceStrategy = Literal["truncate-and-insert", "insert-from-staging", "staging-optimized"] TDestinationConfig = TypeVar("TDestinationConfig", bound="DestinationClientConfiguration") @@ -312,7 +313,9 @@ def should_truncate_table_before_load(self, table: TTableSchema) -> bool: return table["write_disposition"] == "replace" def create_table_chain_completed_followup_jobs( - self, table_chain: Sequence[TTableSchema] + self, + table_chain: Sequence[TTableSchema], + table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, ) -> List[NewLoadJob]: """Creates a list of followup jobs that should be executed after a table chain is completed""" return [] diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py new file mode 100644 index 0000000000..32847303f8 --- /dev/null +++ b/dlt/common/libs/deltalake.py @@ -0,0 +1,90 @@ +from typing import Optional, Dict, Union + +from dlt import version +from dlt.common import logger +from dlt.common.libs.pyarrow import pyarrow as pa +from dlt.common.libs.pyarrow import dataset_to_table, cast_arrow_schema_types +from dlt.common.schema.typing import TWriteDisposition +from dlt.common.exceptions import MissingDependencyException +from dlt.common.storages import FilesystemConfiguration + +try: + from deltalake import write_deltalake +except ModuleNotFoundError: + raise MissingDependencyException( + "dlt 
deltalake helpers", + [f"{version.DLT_PKG_NAME}[deltalake]"], + "Install `deltalake` so dlt can create Delta tables in the `filesystem` destination.", + ) + + +def ensure_delta_compatible_arrow_table(table: pa.table) -> pa.Table: + """Returns Arrow table compatible with Delta table format. + + Casts table schema to replace data types not supported by Delta. + """ + ARROW_TO_DELTA_COMPATIBLE_ARROW_TYPE_MAP = { + # maps type check function to type factory function + pa.types.is_null: pa.string(), + pa.types.is_time: pa.string(), + pa.types.is_decimal256: pa.string(), # pyarrow does not allow downcasting to decimal128 + } + adjusted_schema = cast_arrow_schema_types( + table.schema, ARROW_TO_DELTA_COMPATIBLE_ARROW_TYPE_MAP + ) + return table.cast(adjusted_schema) + + +def get_delta_write_mode(write_disposition: TWriteDisposition) -> str: + """Translates dlt write disposition to Delta write mode.""" + if write_disposition in ("append", "merge"): # `merge` disposition resolves to `append` + return "append" + elif write_disposition == "replace": + return "overwrite" + else: + raise ValueError( + "`write_disposition` must be `append`, `replace`, or `merge`," + f" but `{write_disposition}` was provided." + ) + + +def write_delta_table( + path: str, + data: Union[pa.Table, pa.dataset.Dataset], + write_disposition: TWriteDisposition, + storage_options: Optional[Dict[str, str]] = None, +) -> None: + """Writes in-memory Arrow table to on-disk Delta table.""" + + table = dataset_to_table(data) + + # throws warning for `s3` protocol: https://github.com/delta-io/delta-rs/issues/2460 + # TODO: upgrade `deltalake` lib after https://github.com/delta-io/delta-rs/pull/2500 + # is released + write_deltalake( # type: ignore[call-overload] + table_or_uri=path, + data=ensure_delta_compatible_arrow_table(table), + mode=get_delta_write_mode(write_disposition), + schema_mode="merge", # enable schema evolution (adding new columns) + storage_options=storage_options, + engine="rust", # `merge` schema mode requires `rust` engine + ) + + +def _deltalake_storage_options(config: FilesystemConfiguration) -> Dict[str, str]: + """Returns dict that can be passed as `storage_options` in `deltalake` library.""" + creds = {} + extra_options = {} + if config.protocol in ("az", "gs", "s3"): + creds = config.credentials.to_object_store_rs_credentials() + if config.deltalake_storage_options is not None: + extra_options = config.deltalake_storage_options + shared_keys = creds.keys() & extra_options.keys() + if len(shared_keys) > 0: + logger.warning( + "The `deltalake_storage_options` configuration dictionary contains " + "keys also provided by dlt's credential system: " + + ", ".join([f"`{key}`" for key in shared_keys]) + + ". dlt will use the values in `deltalake_storage_options`." 
+ ) + return {**creds, **extra_options} diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index d6ee5be4cd..28f3ddb598 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -28,6 +28,7 @@ import pyarrow import pyarrow.parquet import pyarrow.compute + import pyarrow.dataset except ModuleNotFoundError: raise MissingDependencyException( "dlt pyarrow helpers", @@ -37,6 +38,8 @@ TAnyArrowItem = Union[pyarrow.Table, pyarrow.RecordBatch] +ARROW_DECIMAL_MAX_PRECISION = 76 + def get_py_arrow_datatype( column: TColumnType, @@ -411,6 +414,29 @@ def pq_stream_with_new_columns( yield tbl +def dataset_to_table(data: Union[pyarrow.Table, pyarrow.dataset.Dataset]) -> pyarrow.Table: + return data.to_table() if isinstance(data, pyarrow.dataset.Dataset) else data + + +def cast_arrow_schema_types( + schema: pyarrow.Schema, + type_map: Dict[Callable[[pyarrow.DataType], bool], Callable[..., pyarrow.DataType]], +) -> pyarrow.Schema: + """Returns type-casted Arrow schema. + + Replaces data types for fields matching a type check in `type_map`. + Type check functions in `type_map` are assumed to be mutually exclusive, i.e. + a data type does not match more than one type check function. + """ + for i, e in enumerate(schema.types): + for type_check, cast_type in type_map.items(): + if type_check(e): + adjusted_field = schema.field(i).with_type(cast_type) + schema = schema.set(i, adjusted_field) + break # if type matches type check, do not do other type checks + return schema + + class NameNormalizationClash(ValueError): def __init__(self, reason: str) -> None: msg = f"Arrow column name clash after input data normalization. {reason}" diff --git a/dlt/common/libs/pydantic.py b/dlt/common/libs/pydantic.py index e6af064b8f..774a1641a7 100644 --- a/dlt/common/libs/pydantic.py +++ b/dlt/common/libs/pydantic.py @@ -106,6 +106,9 @@ def pydantic_to_table_schema_columns( inner_type = extract_inner_type(annotation) if is_union_type(inner_type): + # TODO: order those types deterministically before getting first one + # order of the types in union is in many cases not deterministic + # https://docs.python.org/3/library/typing.html#typing.get_args first_argument_type = get_args(inner_type)[0] inner_type = extract_inner_type(first_argument_type) diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index b48770e4ef..fb360b38d3 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -64,7 +64,7 @@ "dedup_sort", ] """Known hints of a column used to declare hint regexes.""" -TTableFormat = Literal["iceberg", "parquet", "jsonl"] +TTableFormat = Literal["iceberg", "delta"] TTypeDetections = Literal[ "timestamp", "iso_timestamp", "iso_date", "large_integer", "hexbytes_to_text", "wei_to_double" ] diff --git a/dlt/common/storages/configuration.py b/dlt/common/storages/configuration.py index 6e100536af..09beb0015e 100644 --- a/dlt/common/storages/configuration.py +++ b/dlt/common/storages/configuration.py @@ -83,6 +83,7 @@ class FilesystemConfiguration(BaseConfiguration): """Indicates read only filesystem access. 
Will enable caching""" kwargs: Optional[DictStrAny] = None client_kwargs: Optional[DictStrAny] = None + deltalake_storage_options: Optional[DictStrAny] = None @property def protocol(self) -> str: diff --git a/dlt/common/storages/load_package.py b/dlt/common/storages/load_package.py index e7c7f7a164..4d72458e3e 100644 --- a/dlt/common/storages/load_package.py +++ b/dlt/common/storages/load_package.py @@ -141,7 +141,7 @@ def create_load_id() -> str: class ParsedLoadJobFileName(NamedTuple): - """Represents a file name of a job in load package. The file name contains name of a table, number of times the job was retired, extension + """Represents a file name of a job in load package. The file name contains name of a table, number of times the job was retried, extension and a 5 bytes random string to make job file name unique. The job id does not contain retry count and is immutable during loading of the data """ diff --git a/dlt/destinations/impl/athena/__init__.py b/dlt/destinations/impl/athena/__init__.py index 2577a6825e..87a11f9f41 100644 --- a/dlt/destinations/impl/athena/__init__.py +++ b/dlt/destinations/impl/athena/__init__.py @@ -11,6 +11,7 @@ def capabilities() -> DestinationCapabilitiesContext: # athena only supports loading from staged files on s3 for now caps.preferred_loader_file_format = None caps.supported_loader_file_formats = [] + caps.supported_table_formats = ["iceberg"] caps.preferred_staging_file_format = "parquet" caps.supported_staging_file_formats = ["parquet", "jsonl"] caps.escape_identifier = escape_athena_identifier diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 8f043ba4d5..60ea64a4e7 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -451,11 +451,11 @@ def _get_table_update_sql( {partition_clause} LOCATION '{location.rstrip('/')}' TBLPROPERTIES ('table_type'='ICEBERG', 'format'='parquet');""") - elif table_format == "jsonl": - sql.append(f"""CREATE EXTERNAL TABLE {qualified_table_name} - ({columns}) - ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' - LOCATION '{location}';""") + # elif table_format == "jsonl": + # sql.append(f"""CREATE EXTERNAL TABLE {qualified_table_name} + # ({columns}) + # ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' + # LOCATION '{location}';""") else: sql.append(f"""CREATE EXTERNAL TABLE {qualified_table_name} ({columns}) diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index 16affbc164..3c78493b57 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -16,6 +16,7 @@ from dlt.common.pendulum import pendulum from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.storages import FileStorage +from dlt.common.storages.load_package import LoadJobInfo from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.exceptions import ( DestinationTerminalException, @@ -157,7 +158,9 @@ def restore_file_load(self, file_path: str) -> LoadJob: return JOBS[job_id] def create_table_chain_completed_followup_jobs( - self, table_chain: Sequence[TTableSchema] + self, + table_chain: Sequence[TTableSchema], + table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, ) -> List[NewLoadJob]: """Creates a list of followup jobs that should be executed after a table chain is completed""" return [] diff --git a/dlt/destinations/impl/filesystem/__init__.py b/dlt/destinations/impl/filesystem/__init__.py index 
12e83216cf..49fabd61d7 100644 --- a/dlt/destinations/impl/filesystem/__init__.py +++ b/dlt/destinations/impl/filesystem/__init__.py @@ -1,5 +1,24 @@ -from dlt.common.destination import DestinationCapabilitiesContext +from typing import Sequence, Tuple + +from dlt.common.schema.typing import TTableSchema +from dlt.common.destination import DestinationCapabilitiesContext, TLoaderFileFormat + + +def loader_file_format_adapter( + preferred_loader_file_format: TLoaderFileFormat, + supported_loader_file_formats: Sequence[TLoaderFileFormat], + /, + *, + table_schema: TTableSchema, +) -> Tuple[TLoaderFileFormat, Sequence[TLoaderFileFormat]]: + if table_schema.get("table_format") == "delta": + return ("parquet", ["parquet"]) + return (preferred_loader_file_format, supported_loader_file_formats) def capabilities() -> DestinationCapabilitiesContext: - return DestinationCapabilitiesContext.generic_capabilities("jsonl") + return DestinationCapabilitiesContext.generic_capabilities( + preferred_loader_file_format="jsonl", + loader_file_format_adapter=loader_file_format_adapter, + supported_table_formats=["delta"], + ) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index d75226be13..9d15ba959e 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -1,18 +1,18 @@ import posixpath -import pathlib import os import base64 + from types import TracebackType -from typing import ClassVar, List, Type, Iterable, Set, Iterator, Optional, Tuple, cast +from typing import ClassVar, List, Type, Iterable, Iterator, Optional, Tuple, Sequence, cast from fsspec import AbstractFileSystem from contextlib import contextmanager -from dlt.common import json, pendulum -from dlt.common.typing import DictStrAny import dlt -from dlt.common import logger, time +from dlt.common import logger, time, json, pendulum +from dlt.common.typing import DictStrAny from dlt.common.schema import Schema, TSchemaTables, TTableSchema from dlt.common.storages import FileStorage, fsspec_from_config +from dlt.common.storages.load_package import LoadJobInfo, ParsedLoadJobFileName from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( NewLoadJob, @@ -25,9 +25,10 @@ StorageSchemaInfo, StateInfo, DoNothingJob, + DoNothingFollowupJob, ) from dlt.common.destination.exceptions import DestinationUndefinedEntity -from dlt.destinations.job_impl import EmptyLoadJob +from dlt.destinations.job_impl import EmptyLoadJob, NewReferenceJob from dlt.destinations.impl.filesystem import capabilities from dlt.destinations.impl.filesystem.configuration import FilesystemDestinationClientConfiguration from dlt.destinations.job_impl import NewReferenceJob @@ -41,57 +42,48 @@ class LoadFilesystemJob(LoadJob): def __init__( self, + client: "FilesystemClient", local_path: str, - dataset_path: str, - *, - config: FilesystemDestinationClientConfiguration, - schema_name: str, load_id: str, + table: TTableSchema, ) -> None: - file_name = FileStorage.get_file_name_from_file_path(local_path) - self.config = config - self.dataset_path = dataset_path - self.is_local_filesystem = config.protocol == "file" + self.client = client + self.table = table + self.is_local_filesystem = client.config.protocol == "file" # pick local filesystem pathlib or posix for buckets self.pathlib = os.path if self.is_local_filesystem else posixpath + + file_name = FileStorage.get_file_name_from_file_path(local_path) + 
super().__init__(file_name) + self.destination_file_name = path_utils.create_path( - config.layout, + client.config.layout, file_name, - schema_name, + client.schema.name, load_id, - current_datetime=config.current_datetime, + current_datetime=client.config.current_datetime, load_package_timestamp=dlt.current.load_package()["state"]["created_at"], - extra_placeholders=config.extra_placeholders, + extra_placeholders=client.config.extra_placeholders, ) - - super().__init__(file_name) - fs_client, _ = fsspec_from_config(config) # We would like to avoid failing for local filesystem where # deeply nested directory will not exist before writing a file. # It `auto_mkdir` is disabled by default in fsspec so we made some # trade offs between different options and decided on this. - item = self.make_remote_path() + # remote_path = f"{client.config.protocol}://{posixpath.join(dataset_path, destination_file_name)}" + remote_path = self.make_remote_path() if self.is_local_filesystem: - fs_client.makedirs(self.pathlib.dirname(item), exist_ok=True) - fs_client.put_file(local_path, item) + client.fs_client.makedirs(self.pathlib.dirname(remote_path), exist_ok=True) + client.fs_client.put_file(local_path, remote_path) def make_remote_path(self) -> str: """Returns path on the remote filesystem to which copy the file, without scheme. For local filesystem a native path is used""" # path.join does not normalize separators and available # normalization functions are very invasive and may string the trailing separator return self.pathlib.join( # type: ignore[no-any-return] - self.dataset_path, + self.client.dataset_path, path_utils.normalize_path_sep(self.pathlib, self.destination_file_name), ) - def make_remote_uri(self) -> str: - """Returns uri to the remote filesystem to which copy the file""" - remote_path = self.make_remote_path() - if self.is_local_filesystem: - return self.config.make_file_uri(remote_path) - else: - return f"{self.config.protocol}://{remote_path}" - def state(self) -> TLoadJobState: return "completed" @@ -99,12 +91,60 @@ def exception(self) -> str: raise NotImplementedError() +class DeltaLoadFilesystemJob(NewReferenceJob): + def __init__( + self, + client: "FilesystemClient", + table: TTableSchema, + table_jobs: Sequence[LoadJobInfo], + ) -> None: + self.client = client + self.table = table + self.table_jobs = table_jobs + + ref_file_name = ParsedLoadJobFileName( + table["name"], ParsedLoadJobFileName.new_file_id(), 0, "reference" + ).file_name() + super().__init__( + file_name=ref_file_name, + status="running", + remote_path=self.client.make_remote_uri(self.make_remote_path()), + ) + + self.write() + + def write(self) -> None: + from dlt.common.libs.pyarrow import pyarrow as pa + from dlt.common.libs.deltalake import ( + write_delta_table, + _deltalake_storage_options, + ) + + file_paths = [job.file_path for job in self.table_jobs] + + write_delta_table( + path=self.client.make_remote_uri(self.make_remote_path()), + data=pa.dataset.dataset(file_paths), + write_disposition=self.table["write_disposition"], + storage_options=_deltalake_storage_options(self.client.config), + ) + + def make_remote_path(self) -> str: + # directory path, not file path + return self.client.get_table_dir(self.table["name"]) + + def state(self) -> TLoadJobState: + return "completed" + + class FollowupFilesystemJob(FollowupJob, LoadFilesystemJob): def create_followup_jobs(self, final_state: TLoadJobState) -> List[NewLoadJob]: jobs = super().create_followup_jobs(final_state) if final_state == "completed": ref_job = 
NewReferenceJob( - file_name=self.file_name(), status="running", remote_path=self.make_remote_uri() + file_name=self.file_name(), + status="running", + remote_path=self.client.make_remote_uri(self.make_remote_path()), ) jobs.append(ref_job) return jobs @@ -282,19 +322,24 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> # where we want to load the state the regular way if table["name"] == self.schema.state_table_name and not self.config.as_staging: return DoNothingJob(file_path) + if table.get("table_format") == "delta": + import dlt.common.libs.deltalake # assert dependencies are installed + + return DoNothingFollowupJob(file_path) cls = FollowupFilesystemJob if self.config.as_staging else LoadFilesystemJob - return cls( - file_path, - self.dataset_path, - config=self.config, - schema_name=self.schema.name, - load_id=load_id, - ) + return cls(self, file_path, load_id, table) def restore_file_load(self, file_path: str) -> LoadJob: return EmptyLoadJob.from_file_path(file_path, "completed") + def make_remote_uri(self, remote_path: str) -> str: + """Returns uri to the remote filesystem to which copy the file""" + if self.is_local_filesystem: + return self.config.make_file_uri(remote_path) + else: + return f"{self.config.protocol}://{remote_path}" + def __enter__(self) -> "FilesystemClient": return self @@ -306,6 +351,12 @@ def __exit__( def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool: return False + def should_truncate_table_before_load(self, table: TTableSchema) -> bool: + return ( + table["write_disposition"] == "replace" + and not table.get("table_format") == "delta" # Delta can do a logical replace + ) + # # state stuff # @@ -473,3 +524,25 @@ def get_stored_schema(self) -> Optional[StorageSchemaInfo]: def get_stored_schema_by_hash(self, version_hash: str) -> Optional[StorageSchemaInfo]: return self._get_stored_schema_by_hash_or_newest(version_hash) + + def create_table_chain_completed_followup_jobs( + self, + table_chain: Sequence[TTableSchema], + table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, + ) -> List[NewLoadJob]: + def get_table_jobs( + table_jobs: Sequence[LoadJobInfo], table_name: str + ) -> Sequence[LoadJobInfo]: + return [job for job in table_jobs if job.job_file_info.table_name == table_name] + + assert table_chain_jobs is not None + jobs = super().create_table_chain_completed_followup_jobs(table_chain, table_chain_jobs) + table_format = table_chain[0].get("table_format") + if table_format == "delta": + delta_jobs = [ + DeltaLoadFilesystemJob(self, table, get_table_jobs(table_chain_jobs, table["name"])) + for table in table_chain + ] + jobs.extend(delta_jobs) + + return jobs diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 853972fcba..ac3636db2b 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -36,6 +36,7 @@ TTableFormat, ) from dlt.common.storages import FileStorage +from dlt.common.storages.load_package import LoadJobInfo from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns, TSchemaTables from dlt.common.schema.typing import LOADS_TABLE_NAME, VERSION_TABLE_NAME from dlt.common.destination.reference import ( @@ -247,10 +248,12 @@ def _create_replace_followup_jobs( return jobs def create_table_chain_completed_followup_jobs( - self, table_chain: Sequence[TTableSchema] + self, + table_chain: Sequence[TTableSchema], + table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, ) -> List[NewLoadJob]: 
"""Creates a list of followup jobs for merge write disposition and staging replace strategies""" - jobs = super().create_table_chain_completed_followup_jobs(table_chain) + jobs = super().create_table_chain_completed_followup_jobs(table_chain, table_chain_jobs) write_disposition = table_chain[0]["write_disposition"] if write_disposition == "append": jobs.extend(self._create_append_followup_jobs(table_chain)) diff --git a/dlt/destinations/job_impl.py b/dlt/destinations/job_impl.py index 218f73cc59..a4e4b998af 100644 --- a/dlt/destinations/job_impl.py +++ b/dlt/destinations/job_impl.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod import os import tempfile # noqa: 251 -from typing import Dict, Iterable, List +from typing import Dict, Iterable, List, Optional from dlt.common.json import json from dlt.common.destination.reference import NewLoadJob, FollowupJob, TLoadJobState, LoadJob diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index 287474c82c..6fd1928970 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -158,6 +158,10 @@ def columns(self) -> TTableHintTemplate[TTableSchemaColumns]: def schema_contract(self) -> TTableHintTemplate[TSchemaContract]: return self._hints.get("schema_contract") + @property + def table_format(self) -> TTableHintTemplate[TTableFormat]: + return None if self._hints is None else self._hints.get("table_format") + def compute_table_schema(self, item: TDataItem = None, meta: Any = None) -> TTableSchema: """Computes the table schema based on hints and column definitions passed during resource creation. `item` parameter is used to resolve table hints based on data. diff --git a/dlt/load/load.py b/dlt/load/load.py index d96a6b7116..8c7eb431e8 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -256,8 +256,12 @@ def create_followup_jobs( if table_chain := get_completed_table_chain( schema, all_jobs, top_job_table, starting_job.job_file_info().job_id() ): + table_chain_names = [table["name"] for table in table_chain] + table_chain_jobs = [ + job for job in all_jobs if job.job_file_info.table_name in table_chain_names + ] if follow_up_jobs := client.create_table_chain_completed_followup_jobs( - table_chain + table_chain, table_chain_jobs ): jobs = jobs + follow_up_jobs jobs = jobs + starting_job.create_followup_jobs(state) diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index 81220da2dd..eed98d7563 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -9,10 +9,7 @@ from dlt.common.runtime import signals from dlt.common.schema.typing import TSchemaEvolutionMode, TTableSchemaColumns, TSchemaContractDict from dlt.common.schema.utils import has_table_seen_data -from dlt.common.storages import ( - NormalizeStorage, - LoadStorage, -) +from dlt.common.storages import NormalizeStorage from dlt.common.storages.data_item_storage import DataItemStorage from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.typing import DictStrAny, TDataItem diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index b90c15a5f7..75cb9be707 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -20,7 +20,7 @@ from dlt.common.runners import TRunMetrics, Runnable, NullExecutor from dlt.common.runtime import signals from dlt.common.runtime.collector import Collector, NULL_COLLECTOR -from dlt.common.schema.typing import TStoredSchema +from dlt.common.schema.typing import TStoredSchema, TTableSchema from dlt.common.schema.utils import 
merge_schema_updates from dlt.common.storages import ( NormalizeStorage, @@ -110,33 +110,76 @@ def w_normalize_files( ) -> TWorkerRV: destination_caps = config.destination_capabilities schema_updates: List[TSchemaUpdate] = [] - item_normalizers: Dict[TDataItemFormat, ItemsNormalizer] = {} - # Use default storage if parquet is not supported to make normalizer fallback to read rows from the file + # normalizers are cached per table name + item_normalizers: Dict[str, ItemsNormalizer] = {} + preferred_file_format = ( destination_caps.preferred_loader_file_format or destination_caps.preferred_staging_file_format ) # TODO: capabilities.supported_*_formats can be None, it should have defaults - supported_formats = destination_caps.supported_loader_file_formats or [] + supported_file_formats = destination_caps.supported_loader_file_formats or [] + supported_table_formats = destination_caps.supported_table_formats or [] # process all files with data items and write to buffered item storage with Container().injectable_context(destination_caps): schema = Schema.from_stored_schema(stored_schema) normalize_storage = NormalizeStorage(False, normalize_storage_config) - load_storage = LoadStorage(False, supported_formats, loader_storage_config) + load_storage = LoadStorage(False, supported_file_formats, loader_storage_config) + + def _get_items_normalizer( + item_format: TDataItemFormat, table_schema: Optional[TTableSchema] + ) -> ItemsNormalizer: + table_name = table_schema["name"] + if table_name in item_normalizers: + return item_normalizers[table_name] + + if ( + "table_format" in table_schema + and table_schema["table_format"] not in supported_table_formats + ): + logger.warning( + "Destination does not support the configured `table_format` value " + f"`{table_schema['table_format']}` for table `{table_schema['name']}`. " + "The setting will probably be ignored." + ) + + items_preferred_file_format = preferred_file_format + items_supported_file_formats = supported_file_formats + if destination_caps.loader_file_format_adapter is not None: + items_preferred_file_format, items_supported_file_formats = ( + destination_caps.loader_file_format_adapter( + preferred_file_format, + ( + supported_file_formats.copy() + if isinstance(supported_file_formats, list) + else supported_file_formats + ), + table_schema=table_schema, + ) + ) - def _get_items_normalizer(item_format: TDataItemFormat) -> ItemsNormalizer: - if item_format in item_normalizers: - return item_normalizers[item_format] # force file format + best_writer_spec = None if config.loader_file_format: - # TODO: pass supported_formats, when used in pipeline we already checked that - # but if normalize is used standalone `supported_loader_file_formats` may be unresolved - best_writer_spec = get_best_writer_spec(item_format, config.loader_file_format) - else: + if config.loader_file_format in items_supported_file_formats: + # TODO: pass supported_file_formats, when used in pipeline we already checked that + # but if normalize is used standalone `supported_loader_file_formats` may be unresolved + best_writer_spec = get_best_writer_spec( + item_format, config.loader_file_format + ) + else: + logger.warning( + f"The configured value `{config.loader_file_format}` " + "for `loader_file_format` is not supported for table " + f"`{table_schema['name']}` and will be ignored. Dlt " + "will use a supported format instead." 
+ ) + + if best_writer_spec is None: # find best spec among possible formats taking into account destination preference best_writer_spec = resolve_best_writer_spec( - item_format, supported_formats, preferred_file_format + item_format, items_supported_file_formats, items_preferred_file_format ) # if best_writer_spec.file_format != preferred_file_format: # logger.warning( @@ -159,7 +202,7 @@ def _get_items_normalizer(item_format: TDataItemFormat) -> ItemsNormalizer: f" {item_storage.writer_cls.__name__} for item format {item_format} and file" f" format {item_storage.writer_spec.file_format}" ) - norm = item_normalizers[item_format] = cls( + norm = item_normalizers[table_name] = cls( item_storage, normalize_storage, schema, @@ -211,7 +254,8 @@ def _gather_metrics_and_close( ) root_tables.add(root_table_name) normalizer = _get_items_normalizer( - DataWriter.item_format_from_file_extension(parsed_file_name.file_format) + DataWriter.item_format_from_file_extension(parsed_file_name.file_format), + stored_schema["tables"].get(root_table_name, {"name": root_table_name}), ) logger.debug( f"Processing extracted items in {extracted_items_file} in load_id" diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index b0fb6fe57c..8dfb93b8da 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -104,7 +104,7 @@ from dlt.common.warnings import deprecated, Dlt04DeprecationWarning from dlt.common.versioned_state import json_encode_state, json_decode_state -from dlt.extract import DltSource +from dlt.extract import DltSource, DltResource from dlt.extract.exceptions import SourceExhausted from dlt.extract.extract import Extract, data_to_sources from dlt.normalize import Normalize @@ -662,6 +662,7 @@ def run( Returns: LoadInfo: Information on loaded data including the list of package ids and failed job statuses. Please not that `dlt` will not raise if a single job terminally fails. Such information is provided via LoadInfo. """ + signals.raise_if_signalled() self.activate() self._set_destinations(destination=destination, staging=staging) @@ -679,7 +680,6 @@ def run( self.sync_destination(destination, staging, dataset_name) # sync only once self._state_restored = True - # normalize and load pending data if self.list_extracted_load_packages(): self.normalize(loader_file_format=loader_file_format) diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index 9c7d961d3a..e93ffb54d4 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -480,6 +480,40 @@ You can choose the following file formats: * [parquet](../file-formats/parquet.md) is supported * [csv](../file-formats/csv.md) is supported +## Supported table formats +You can choose the following table formats: +* [Delta](../table-formats/delta.md) is supported + +### Delta table format +You need the `deltalake` package to use this format: + +```sh +pip install "dlt[deltalake]" +``` + +Set the `table_format` argument to `delta` when defining your resource: + +```py +@dlt.resource(table_format="delta") +def my_delta_resource(): + ... +``` + +> `dlt` always uses `parquet` as `loader_file_format` when using the `delta` table format. Any setting of `loader_file_format` is disregarded. 
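Putting it together, a minimal sketch of loading a resource as a Delta table (the pipeline name, resource name, and sample rows are illustrative; it assumes `bucket_url` and, for remote buckets, credentials are already configured for the `filesystem` destination):

```py
import dlt

@dlt.resource(table_format="delta")
def my_delta_resource():
    yield [{"id": 1}, {"id": 2}]

pipeline = dlt.pipeline(pipeline_name="delta_demo", destination="filesystem")
pipeline.run(my_delta_resource())
```

With this setup, the rows for `my_delta_resource` should be written as parquet files and committed as a Delta table in that table's folder at the destination.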
+ +#### Storage options +You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`: + +```toml +[destination.filesystem] +deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}' +``` + +`dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used. + +You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided, before passing it as `storage_options`. + +>❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior. ## Syncing of `dlt` state This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). To this end, special folders and files that will be created at your destination which hold information about your pipeline state, schemas and completed loads. These folders DO NOT respect your diff --git a/docs/website/docs/dlt-ecosystem/table-formats/delta.md b/docs/website/docs/dlt-ecosystem/table-formats/delta.md new file mode 100644 index 0000000000..7840f40d11 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/table-formats/delta.md @@ -0,0 +1,13 @@ +--- +title: Delta +description: The Delta table format +keywords: [delta, table formats] +--- + +# Delta table format + +[Delta](https://delta.io/) is an open source table format. `dlt` can store data as Delta tables. + +## Supported Destinations + +Supported by: **Databricks**, **filesystem** diff --git a/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md new file mode 100644 index 0000000000..a34bab9a0c --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md @@ -0,0 +1,13 @@ +--- +title: Iceberg +description: The Iceberg table format +keywords: [iceberg, table formats] +--- + +# Iceberg table format + +[Iceberg](https://iceberg.apache.org/) is an open source table format. `dlt` can store data as Iceberg tables. + +## Supported Destinations + +Supported by: **Athena** diff --git a/poetry.lock b/poetry.lock index b476bc4a9f..e61d505a4a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
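As a follow-up to the `deltalake_storage_options` section above: the same setting can also be supplied through dlt's standard configuration mechanisms outside `config.toml`, for example via an environment variable. A minimal sketch (the JSON values are placeholders and the variable name assumes dlt's usual `SECTION__KEY` convention for config env vars):

```py
import json
import os

# equivalent of the TOML snippet above, expressed as an environment variable
os.environ["DESTINATION__FILESYSTEM__DELTALAKE_STORAGE_OPTIONS"] = json.dumps(
    {"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}
)
```

On key clashes, the values from `deltalake_storage_options` take precedence over the credentials dlt resolves, as implemented in `_deltalake_storage_options` earlier in this patch.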
[[package]] name = "about-time" @@ -2400,6 +2400,30 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "deltalake" +version = "0.17.4" +description = "Native Delta Lake Python binding based on delta-rs with Pandas integration" +optional = true +python-versions = ">=3.8" +files = [ + {file = "deltalake-0.17.4-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3f048bd4cdd3500fbb0d1b34046966ca4b7cefd1e9df71460b881ee8ad7f844a"}, + {file = "deltalake-0.17.4-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:b539265d8293794872e1dc3b2daad50abe05ab425e961824b3ac1155bb294604"}, + {file = "deltalake-0.17.4-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55e6be5f5ab8d5d34d2ea58d86e93eec2da5d2476e3c15e9520239457618bca4"}, + {file = "deltalake-0.17.4-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94dde6c2d0a07e9ce47be367d016541d3a499839350852205819353441e1a9c1"}, + {file = "deltalake-0.17.4-cp38-abi3-win_amd64.whl", hash = "sha256:f51f499d50dad88bdc18c5ed7c2319114759f3220f83aa2d32166c19accee4ce"}, + {file = "deltalake-0.17.4.tar.gz", hash = "sha256:c3c10577afc46d4b10ed16246d814a8c40b3663099066681eeba89f908373814"}, +] + +[package.dependencies] +pyarrow = ">=8" +pyarrow-hotfix = "*" + +[package.extras] +devel = ["mypy (>=1.8.0,<1.9.0)", "packaging (>=20)", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-timeout", "ruff (>=0.3.0,<0.4.0)", "sphinx (<=4.5)", "sphinx-rtd-theme", "toml", "wheel"] +pandas = ["pandas"] +pyspark = ["delta-spark", "numpy (==1.22.2)", "pyspark"] + [[package]] name = "deprecated" version = "1.2.14" @@ -3680,106 +3704,6 @@ files = [ {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"}, {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = "sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"}, {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:222fc2ee0e40522de0b21ad3bc90ab8983be3bf3cec3d349c80d76c8bb1a4beb"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d4763b0b9195b72132a4e7de8e5a9bf1f05542f442a9115aa27cfc2a8004f581"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:209649da10c9d4a93d8a4d100ecbf9cc3b0252169426bec3e8b4ad7e57d600cf"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:68813aa333c1604a2df4a495b2a6ed065d7c8aebf26cc7e7abb5a6835d08353c"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:370a23ec775ad14e9d1e71474d56f381224dcf3e72b15d8ca7b4ad7dd9cd5853"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:14664a66a3ddf6bc9e56f401bf029db2d169982c53eff3f5876399104df0e9a6"}, - {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ea3722cc4932cbcebd553b69dce1b4a73572823cff4e6a244f1c855da21d511"}, - {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e14bb264c40fd7c627ef5678e295370cd6ba95ca71d835798b6e37502fc4c690"}, - {file = "google_re2-1.1-5-cp310-cp310-win32.whl", hash = "sha256:39512cd0151ea4b3969c992579c79b423018b464624ae955be685fc07d94556c"}, - {file = 
"google_re2-1.1-5-cp310-cp310-win_amd64.whl", hash = "sha256:ac66537aa3bc5504320d922b73156909e3c2b6da19739c866502f7827b3f9fdf"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5b5ea68d54890c9edb1b930dcb2658819354e5d3f2201f811798bbc0a142c2b4"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:33443511b6b83c35242370908efe2e8e1e7cae749c766b2b247bf30e8616066c"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:413d77bdd5ba0bfcada428b4c146e87707452ec50a4091ec8e8ba1413d7e0619"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:5171686e43304996a34baa2abcee6f28b169806d0e583c16d55e5656b092a414"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3b284db130283771558e31a02d8eb8fb756156ab98ce80035ae2e9e3a5f307c4"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:296e6aed0b169648dc4b870ff47bd34c702a32600adb9926154569ef51033f47"}, - {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:38d50e68ead374160b1e656bbb5d101f0b95fb4cc57f4a5c12100155001480c5"}, - {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2a0416a35921e5041758948bcb882456916f22845f66a93bc25070ef7262b72a"}, - {file = "google_re2-1.1-5-cp311-cp311-win32.whl", hash = "sha256:a1d59568bbb5de5dd56dd6cdc79907db26cce63eb4429260300c65f43469e3e7"}, - {file = "google_re2-1.1-5-cp311-cp311-win_amd64.whl", hash = "sha256:72f5a2f179648b8358737b2b493549370debd7d389884a54d331619b285514e3"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:cbc72c45937b1dc5acac3560eb1720007dccca7c9879138ff874c7f6baf96005"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5fadd1417fbef7235fa9453dba4eb102e6e7d94b1e4c99d5fa3dd4e288d0d2ae"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:040f85c63cc02696485b59b187a5ef044abe2f99b92b4fb399de40b7d2904ccc"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:64e3b975ee6d9bbb2420494e41f929c1a0de4bcc16d86619ab7a87f6ea80d6bd"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8ee370413e00f4d828eaed0e83b8af84d7a72e8ee4f4bd5d3078bc741dfc430a"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:5b89383001079323f693ba592d7aad789d7a02e75adb5d3368d92b300f5963fd"}, - {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:63cb4fdfbbda16ae31b41a6388ea621510db82feb8217a74bf36552ecfcd50ad"}, - {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ebedd84ae8be10b7a71a16162376fd67a2386fe6361ef88c622dcf7fd679daf"}, - {file = "google_re2-1.1-5-cp312-cp312-win32.whl", hash = "sha256:c8e22d1692bc2c81173330c721aff53e47ffd3c4403ff0cd9d91adfd255dd150"}, - {file = "google_re2-1.1-5-cp312-cp312-win_amd64.whl", hash = "sha256:5197a6af438bb8c4abda0bbe9c4fbd6c27c159855b211098b29d51b73e4cbcf6"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b6727e0b98417e114b92688ad2aa256102ece51f29b743db3d831df53faf1ce3"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:711e2b6417eb579c61a4951029d844f6b95b9b373b213232efd413659889a363"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_arm64.whl", hash = 
"sha256:71ae8b3df22c5c154c8af0f0e99d234a450ef1644393bc2d7f53fc8c0a1e111c"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:94a04e214bc521a3807c217d50cf099bbdd0c0a80d2d996c0741dbb995b5f49f"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:a770f75358508a9110c81a1257721f70c15d9bb592a2fb5c25ecbd13566e52a5"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:07c9133357f7e0b17c6694d5dcb82e0371f695d7c25faef2ff8117ef375343ff"}, - {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:204ca6b1cf2021548f4a9c29ac015e0a4ab0a7b6582bf2183d838132b60c8fda"}, - {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f0b95857c2c654f419ca684ec38c9c3325c24e6ba7d11910a5110775a557bb18"}, - {file = "google_re2-1.1-5-cp38-cp38-win32.whl", hash = "sha256:347ac770e091a0364e822220f8d26ab53e6fdcdeaec635052000845c5a3fb869"}, - {file = "google_re2-1.1-5-cp38-cp38-win_amd64.whl", hash = "sha256:ec32bb6de7ffb112a07d210cf9f797b7600645c2d5910703fa07f456dd2150e0"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb5adf89060f81c5ff26c28e261e6b4997530a923a6093c9726b8dec02a9a326"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a22630c9dd9ceb41ca4316bccba2643a8b1d5c198f21c00ed5b50a94313aaf10"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:544dc17fcc2d43ec05f317366375796351dec44058e1164e03c3f7d050284d58"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:19710af5ea88751c7768575b23765ce0dfef7324d2539de576f75cdc319d6654"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:f82995a205e08ad896f4bd5ce4847c834fab877e1772a44e5f262a647d8a1dec"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:63533c4d58da9dc4bc040250f1f52b089911699f0368e0e6e15f996387a984ed"}, - {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79e00fcf0cb04ea35a22b9014712d448725ce4ddc9f08cc818322566176ca4b0"}, - {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bc41afcefee2da6c4ed883a93d7f527c4b960cd1d26bbb0020a7b8c2d341a60a"}, - {file = "google_re2-1.1-5-cp39-cp39-win32.whl", hash = "sha256:486730b5e1f1c31b0abc6d80abe174ce4f1188fe17d1b50698f2bf79dc6e44be"}, - {file = "google_re2-1.1-5-cp39-cp39-win_amd64.whl", hash = "sha256:4de637ca328f1d23209e80967d1b987d6b352cd01b3a52a84b4d742c69c3da6c"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:621e9c199d1ff0fdb2a068ad450111a84b3bf14f96dfe5a8a7a0deae5f3f4cce"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:220acd31e7dde95373f97c3d1f3b3bd2532b38936af28b1917ee265d25bebbf4"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:db34e1098d164f76251a6ece30e8f0ddfd65bb658619f48613ce71acb3f9cbdb"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:5152bac41d8073977582f06257219541d0fc46ad99b0bbf30e8f60198a43b08c"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:6191294799e373ee1735af91f55abd23b786bdfd270768a690d9d55af9ea1b0d"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:070cbafbb4fecbb02e98feb28a1eb292fb880f434d531f38cc33ee314b521f1f"}, - {file = 
"google_re2-1.1-6-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8437d078b405a59a576cbed544490fe041140f64411f2d91012e8ec05ab8bf86"}, - {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f00f9a9af8896040e37896d9b9fc409ad4979f1ddd85bb188694a7d95ddd1164"}, - {file = "google_re2-1.1-6-cp310-cp310-win32.whl", hash = "sha256:df26345f229a898b4fd3cafd5f82259869388cee6268fc35af16a8e2293dd4e5"}, - {file = "google_re2-1.1-6-cp310-cp310-win_amd64.whl", hash = "sha256:3665d08262c57c9b28a5bdeb88632ad792c4e5f417e5645901695ab2624f5059"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b26b869d8aa1d8fe67c42836bf3416bb72f444528ee2431cfb59c0d3e02c6ce3"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:41fd4486c57dea4f222a6bb7f1ff79accf76676a73bdb8da0fcbd5ba73f8da71"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:0ee378e2e74e25960070c338c28192377c4dd41e7f4608f2688064bd2badc41e"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:a00cdbf662693367b36d075b29feb649fd7ee1b617cf84f85f2deebeda25fc64"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c09455014217a41499432b8c8f792f25f3df0ea2982203c3a8c8ca0e7895e69"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6501717909185327935c7945e23bb5aa8fc7b6f237b45fe3647fa36148662158"}, - {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3510b04790355f199e7861c29234081900e1e1cbf2d1484da48aa0ba6d7356ab"}, - {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8c0e64c187ca406764f9e9ad6e750d62e69ed8f75bf2e865d0bfbc03b642361c"}, - {file = "google_re2-1.1-6-cp311-cp311-win32.whl", hash = "sha256:2a199132350542b0de0f31acbb3ca87c3a90895d1d6e5235f7792bb0af02e523"}, - {file = "google_re2-1.1-6-cp311-cp311-win_amd64.whl", hash = "sha256:83bdac8ceaece8a6db082ea3a8ba6a99a2a1ee7e9f01a9d6d50f79c6f251a01d"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:81985ff894cd45ab5a73025922ac28c0707759db8171dd2f2cc7a0e856b6b5ad"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5635af26065e6b45456ccbea08674ae2ab62494008d9202df628df3b267bc095"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:813b6f04de79f4a8fdfe05e2cb33e0ccb40fe75d30ba441d519168f9d958bd54"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:5ec2f5332ad4fd232c3f2d6748c2c7845ccb66156a87df73abcc07f895d62ead"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5a687b3b32a6cbb731647393b7c4e3fde244aa557f647df124ff83fb9b93e170"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:39a62f9b3db5d3021a09a47f5b91708b64a0580193e5352751eb0c689e4ad3d7"}, - {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca0f0b45d4a1709cbf5d21f355e5809ac238f1ee594625a1e5ffa9ff7a09eb2b"}, - {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a64b3796a7a616c7861247bd061c9a836b5caf0d5963e5ea8022125601cf7b09"}, - {file = "google_re2-1.1-6-cp312-cp312-win32.whl", hash = "sha256:32783b9cb88469ba4cd9472d459fe4865280a6b1acdad4480a7b5081144c4eb7"}, - {file = "google_re2-1.1-6-cp312-cp312-win_amd64.whl", hash = 
"sha256:259ff3fd2d39035b9cbcbf375995f83fa5d9e6a0c5b94406ff1cc168ed41d6c6"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e4711bcffe190acd29104d8ecfea0c0e42b754837de3fb8aad96e6cc3c613cdc"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:4d081cce43f39c2e813fe5990e1e378cbdb579d3f66ded5bade96130269ffd75"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:4f123b54d48450d2d6b14d8fad38e930fb65b5b84f1b022c10f2913bd956f5b5"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:e1928b304a2b591a28eb3175f9db7f17c40c12cf2d4ec2a85fdf1cc9c073ff91"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:3a69f76146166aec1173003c1f547931bdf288c6b135fda0020468492ac4149f"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:fc08c388f4ebbbca345e84a0c56362180d33d11cbe9ccfae663e4db88e13751e"}, - {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b057adf38ce4e616486922f2f47fc7d19c827ba0a7f69d540a3664eba2269325"}, - {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4138c0b933ab099e96f5d8defce4486f7dfd480ecaf7f221f2409f28022ccbc5"}, - {file = "google_re2-1.1-6-cp38-cp38-win32.whl", hash = "sha256:9693e45b37b504634b1abbf1ee979471ac6a70a0035954592af616306ab05dd6"}, - {file = "google_re2-1.1-6-cp38-cp38-win_amd64.whl", hash = "sha256:5674d437baba0ea287a5a7f8f81f24265d6ae8f8c09384e2ef7b6f84b40a7826"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:7783137cb2e04f458a530c6d0ee9ef114815c1d48b9102f023998c371a3b060e"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a49b7153935e7a303675f4deb5f5d02ab1305adefc436071348706d147c889e0"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a96a8bb309182090704593c60bdb369a2756b38fe358bbf0d40ddeb99c71769f"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:dff3d4be9f27ef8ec3705eed54f19ef4ab096f5876c15fe011628c69ba3b561c"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:40f818b0b39e26811fa677978112a8108269977fdab2ba0453ac4363c35d9e66"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:8a7e53538cdb40ef4296017acfbb05cab0c19998be7552db1cfb85ba40b171b9"}, - {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6ee18e7569fb714e5bb8c42809bf8160738637a5e71ed5a4797757a1fb4dc4de"}, - {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cda4f6d1a7d5b43ea92bc395f23853fba0caf8b1e1efa6e8c48685f912fcb89"}, - {file = "google_re2-1.1-6-cp39-cp39-win32.whl", hash = "sha256:6a9cdbdc36a2bf24f897be6a6c85125876dc26fea9eb4247234aec0decbdccfd"}, - {file = "google_re2-1.1-6-cp39-cp39-win_amd64.whl", hash = "sha256:73f646cecfad7cc5b4330b4192c25f2e29730a3b8408e089ffd2078094208196"}, ] [[package]] @@ -6558,6 +6482,17 @@ files = [ [package.dependencies] numpy = ">=1.16.6" +[[package]] +name = "pyarrow-hotfix" +version = "0.6" +description = "" +optional = true +python-versions = ">=3.5" +files = [ + {file = "pyarrow_hotfix-0.6-py3-none-any.whl", hash = "sha256:dcc9ae2d220dff0083be6a9aa8e0cdee5182ad358d4931fce825c545e5c89178"}, + {file = "pyarrow_hotfix-0.6.tar.gz", hash = "sha256:79d3e030f7ff890d408a100ac16d6f00b14d44a502d7897cd9fc3e3a534e9945"}, +] + [[package]] name = "pyasn1" 
version = "0.5.0" @@ -9368,6 +9303,7 @@ cli = ["cron-descriptor", "pipdeptree"] clickhouse = ["adlfs", "clickhouse-connect", "clickhouse-driver", "gcsfs", "pyarrow", "s3fs"] databricks = ["databricks-sql-connector"] dbt = ["dbt-athena-community", "dbt-bigquery", "dbt-core", "dbt-databricks", "dbt-duckdb", "dbt-redshift", "dbt-snowflake"] +deltalake = ["deltalake", "pyarrow"] dremio = ["pyarrow"] duckdb = ["duckdb"] filesystem = ["botocore", "s3fs"] @@ -9387,4 +9323,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "4584e2332c46a3a409ee605a1f03d110b765a491024ee96e44f62902c0769711" +content-hash = "d9034fc091a6e823373e742530d67b9c075d329afd2fee3bad7467716d2b2b9a" diff --git a/pyproject.toml b/pyproject.toml index 8e98445f02..9086acea9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -85,6 +85,7 @@ databricks-sql-connector = {version = ">=2.9.3,<3.0.0", optional = true} dbt-databricks = {version = ">=1.7.3", optional = true} clickhouse-driver = { version = ">=0.2.7", optional = true } clickhouse-connect = { version = ">=0.7.7", optional = true } +deltalake = { version = ">=0.17.4", optional = true } [tool.poetry.extras] dbt = ["dbt-core", "dbt-redshift", "dbt-bigquery", "dbt-duckdb", "dbt-snowflake", "dbt-athena-community", "dbt-databricks"] @@ -110,6 +111,7 @@ qdrant = ["qdrant-client"] databricks = ["databricks-sql-connector"] clickhouse = ["clickhouse-driver", "clickhouse-connect", "s3fs", "gcsfs", "adlfs", "pyarrow"] dremio = ["pyarrow"] +deltalake = ["deltalake", "pyarrow"] [tool.poetry.scripts] dlt = "dlt.cli._dlt:_main" diff --git a/tests/cases.py b/tests/cases.py index 2b655fdc8b..d145ec1d94 100644 --- a/tests/cases.py +++ b/tests/cases.py @@ -257,6 +257,11 @@ def assert_all_data_types_row( else: db_mapping[binary_col] = bytes(db_mapping[binary_col]) + # `delta` table format stores `wei` type as string + if "col8" in db_mapping: + if isinstance(db_mapping["col8"], str): + db_mapping["col8"] = int(db_mapping["col8"]) + # redshift and bigquery return strings from structured fields if "col9" in db_mapping: if isinstance(db_mapping["col9"], str): @@ -270,7 +275,7 @@ def assert_all_data_types_row( if "col10" in db_mapping: db_mapping["col10"] = db_mapping["col10"].isoformat() if "col11" in db_mapping: - db_mapping["col11"] = db_mapping["col11"].isoformat() + db_mapping["col11"] = ensure_pendulum_time(db_mapping["col11"]).isoformat() if expect_filtered_null_columns: for key, expected in expected_rows.items(): @@ -291,6 +296,8 @@ def arrow_table_all_data_types( include_time: bool = True, include_binary: bool = True, include_decimal: bool = True, + include_decimal_default_precision: bool = False, + include_decimal_arrow_max_precision: bool = False, include_date: bool = True, include_not_normalized_name: bool = True, include_name_clash: bool = False, @@ -337,6 +344,20 @@ def arrow_table_all_data_types( if include_decimal: data["decimal"] = [Decimal(str(round(random.uniform(0, 100), 4))) for _ in range(num_rows)] + if include_decimal_default_precision: + from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION + + data["decimal_default_precision"] = [ + Decimal(int("1" * DEFAULT_NUMERIC_PRECISION)) for _ in range(num_rows) + ] + + if include_decimal_arrow_max_precision: + from dlt.common.libs.pyarrow import ARROW_DECIMAL_MAX_PRECISION + + data["decimal_arrow_max_precision"] = [ + Decimal(int("1" * ARROW_DECIMAL_MAX_PRECISION)) for _ in range(num_rows) + ] + if include_date: data["date"] = pd.date_range("2021-01-01", 
periods=num_rows, tz=tz).date diff --git a/tests/common/configuration/test_annotation_future.py b/tests/common/configuration/test_annotation_future.py new file mode 100644 index 0000000000..800d689fb7 --- /dev/null +++ b/tests/common/configuration/test_annotation_future.py @@ -0,0 +1,25 @@ +from __future__ import annotations + +from typing import Optional + +from dlt.common.configuration import configspec +from dlt.common.configuration.resolve import resolve_configuration +from dlt.common.configuration.specs import BaseConfiguration + +from tests.utils import preserve_environ +from tests.common.configuration.utils import environment + + +def test_str_annotations(environment) -> None: + @configspec + class DataConf(BaseConfiguration): + x: int = None + x_7: Optional[int] = 3 + + assert DataConf.__annotations__ == {"x": "int", "x_7": "Optional[int]"} + assert DataConf.get_resolvable_fields() == {"x": int, "x_7": Optional[int]} + + # resolve it + environment["X"] = "10" + c = resolve_configuration(DataConf()) + assert c.x == 10 diff --git a/tests/libs/test_deltalake.py b/tests/libs/test_deltalake.py new file mode 100644 index 0000000000..d55f788fbe --- /dev/null +++ b/tests/libs/test_deltalake.py @@ -0,0 +1,182 @@ +import os +from typing import Iterator, Tuple, cast + +import pytest +from deltalake import DeltaTable + +import dlt +from dlt.common.libs.pyarrow import pyarrow as pa +from dlt.common.libs.deltalake import ( + write_delta_table, + _deltalake_storage_options, +) +from dlt.common.configuration.specs import AwsCredentials +from dlt.destinations.impl.filesystem.filesystem import ( + FilesystemClient, + FilesystemDestinationClientConfiguration, +) + +from tests.cases import arrow_table_all_data_types + + +@pytest.fixture() +def filesystem_client() -> Iterator[Tuple[FilesystemClient, str]]: + """Returns tuple of local `FilesystemClient` instance and remote directory string. + + Remote directory is removed on teardown. 
+ """ + # setup + os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "_storage" + client = cast(FilesystemClient, dlt.pipeline(destination="filesystem").destination_client()) + remote_dir = os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] + "/tmp_dir" + + yield (client, remote_dir) + + # teardown + if client.fs_client.exists(remote_dir): + client.fs_client.rm(remote_dir, recursive=True) + + +def test_deltalake_storage_options() -> None: + config = FilesystemDestinationClientConfiguration() + + # no credentials, no deltalake_storage_options + config.bucket_url = "_storage://foo" + assert _deltalake_storage_options(config) == dict() + + # no credentials, yes deltalake_storage_options + config.deltalake_storage_options = {"foo": "bar"} + assert _deltalake_storage_options(config) == {"foo": "bar"} + + # yes credentials, yes deltalake_storage_options: no shared keys + creds = AwsCredentials( + aws_access_key_id="dummy_key_id", + aws_secret_access_key="dummy_acces_key", # type: ignore[arg-type] + aws_session_token="dummy_session_token", # type: ignore[arg-type] + region_name="dummy_region_name", + ) + config.credentials = creds + config.bucket_url = "s3://foo" + assert _deltalake_storage_options(config).keys() == { + "aws_access_key_id", + "aws_secret_access_key", + "aws_session_token", + "region", + "foo", + } + + # yes credentials, yes deltalake_storage_options: yes shared keys + config.deltalake_storage_options = {"aws_access_key_id": "i_will_overwrite"} + assert _deltalake_storage_options(config).keys() == { + "aws_access_key_id", + "aws_secret_access_key", + "aws_session_token", + "region", + } + assert _deltalake_storage_options(config)["aws_access_key_id"] == "i_will_overwrite" + + +def test_write_delta_table(filesystem_client) -> None: + client, remote_dir = filesystem_client + client = cast(FilesystemClient, client) + storage_options = _deltalake_storage_options(client.config) + + with pytest.raises(Exception): + # bug in `delta-rs` causes error when writing big decimal values + # https://github.com/delta-io/delta-rs/issues/2510 + # if this test fails, the bug has been fixed and we should remove this + # note from the docs: + write_delta_table( + remote_dir + "/corrupt_delta_table", + arrow_table_all_data_types("arrow-table", include_decimal_default_precision=True)[0], + write_disposition="append", + storage_options=storage_options, + ) + + arrow_table = arrow_table_all_data_types( + "arrow-table", + include_decimal_default_precision=False, + include_decimal_arrow_max_precision=True, + num_rows=2, + )[0] + + # first write should create Delta table with same shape as input Arrow table + write_delta_table( + remote_dir, arrow_table, write_disposition="append", storage_options=storage_options + ) + dt = DeltaTable(remote_dir, storage_options=storage_options) + assert dt.version() == 0 + dt_arrow_table = dt.to_pyarrow_table() + assert dt_arrow_table.shape == (arrow_table.num_rows, arrow_table.num_columns) + + # table contents should be different because "time" column has type `string` + # in Delta table, but type `time` in Arrow source table + assert not dt_arrow_table.equals(arrow_table) + casted_cols = ("null", "time", "decimal_arrow_max_precision") + assert dt_arrow_table.drop_columns(casted_cols).equals(arrow_table.drop_columns(casted_cols)) + + # another `append` should create a new table version with twice the number of rows + write_delta_table( + remote_dir, arrow_table, write_disposition="append", storage_options=storage_options + ) + dt = DeltaTable(remote_dir, 
storage_options=storage_options) + assert dt.version() == 1 + assert dt.to_pyarrow_table().shape == (arrow_table.num_rows * 2, arrow_table.num_columns) + + # the `replace` write disposition should trigger a "logical delete" + write_delta_table( + remote_dir, arrow_table, write_disposition="replace", storage_options=storage_options + ) + dt = DeltaTable(remote_dir, storage_options=storage_options) + assert dt.version() == 2 + assert dt.to_pyarrow_table().shape == (arrow_table.num_rows, arrow_table.num_columns) + + # the previous table version should still exist + dt.load_version(1) + assert dt.to_pyarrow_table().shape == (arrow_table.num_rows * 2, arrow_table.num_columns) + + # `merge` should resolve to `append` bevavior + write_delta_table( + remote_dir, arrow_table, write_disposition="merge", storage_options=storage_options + ) + dt = DeltaTable(remote_dir, storage_options=storage_options) + assert dt.version() == 3 + assert dt.to_pyarrow_table().shape == (arrow_table.num_rows * 2, arrow_table.num_columns) + + # add column in source table + evolved_arrow_table = arrow_table.append_column( + "new", pa.array([1 for _ in range(arrow_table.num_rows)]) + ) + assert ( + evolved_arrow_table.num_columns == arrow_table.num_columns + 1 + ) # ensure column was appendend + + # new column should be propagated to Delta table (schema evolution is supported) + write_delta_table( + remote_dir, evolved_arrow_table, write_disposition="append", storage_options=storage_options + ) + dt = DeltaTable(remote_dir, storage_options=storage_options) + assert dt.version() == 4 + dt_arrow_table = dt.to_pyarrow_table() + assert dt_arrow_table.shape == (arrow_table.num_rows * 3, evolved_arrow_table.num_columns) + assert "new" in dt_arrow_table.schema.names + assert dt_arrow_table.column("new").to_pylist() == [1, 1, None, None, None, None] + + # providing a subset of columns should lead to missing columns being null-filled + write_delta_table( + remote_dir, arrow_table, write_disposition="append", storage_options=storage_options + ) + dt = DeltaTable(remote_dir, storage_options=storage_options) + assert dt.version() == 5 + dt_arrow_table = dt.to_pyarrow_table() + assert dt_arrow_table.shape == (arrow_table.num_rows * 4, evolved_arrow_table.num_columns) + assert dt_arrow_table.column("new").to_pylist() == [None, None, 1, 1, None, None, None, None] + + with pytest.raises(ValueError): + # unsupported value for `write_disposition` should raise ValueError + write_delta_table( + remote_dir, + arrow_table, + write_disposition="foo", # type:ignore[arg-type] + storage_options=storage_options, + ) diff --git a/tests/libs/test_pydantic.py b/tests/libs/test_pydantic.py index d6dc29e0c8..951eabbde4 100644 --- a/tests/libs/test_pydantic.py +++ b/tests/libs/test_pydantic.py @@ -161,7 +161,9 @@ class User(BaseModel): address: Annotated[UserAddress, "PII", "address"] uuid_or_str: Union[str, UUID4, None] unity: Union[UserAddress, UserLabel, Dict[str, UserAddress]] - location: Annotated[Optional[Union[str, List[str]]], None] + # NOTE: added "int" because this type was clashing with a type + # in a delta-rs library that got cached and that re-orders the union + location: Annotated[Optional[Union[str, List[str], int]], None] something_required: Annotated[Union[str, int], type(None)] final_location: Final[Annotated[Union[str, int], None]] # type: ignore[misc] final_optional: Final[Annotated[Optional[str], None]] # type: ignore[misc] diff --git a/tests/load/filesystem/test_filesystem_client.py b/tests/load/filesystem/test_filesystem_client.py 
index 4519f1ea83..fbfd08271b 100644 --- a/tests/load/filesystem/test_filesystem_client.py +++ b/tests/load/filesystem/test_filesystem_client.py @@ -14,7 +14,6 @@ INIT_FILE_NAME, ) - from dlt.destinations.path_utils import create_path, prepare_datetime_params from tests.load.filesystem.utils import perform_load from tests.utils import clean_test_storage, init_test_logging diff --git a/tests/load/filesystem/test_filesystem_common.py b/tests/load/filesystem/test_filesystem_common.py index c069f88a15..270e1ff70c 100644 --- a/tests/load/filesystem/test_filesystem_common.py +++ b/tests/load/filesystem/test_filesystem_common.py @@ -47,6 +47,7 @@ def test_filesystem_configuration() -> None: "credentials": None, "client_kwargs": None, "kwargs": None, + "deltalake_storage_options": None, } @@ -145,6 +146,7 @@ def test_filesystem_configuration_with_additional_arguments() -> None: bucket_url="az://root", kwargs={"use_ssl": True}, client_kwargs={"verify": "public.crt"}, + deltalake_storage_options={"AWS_S3_LOCKING_PROVIDER": "dynamodb"}, ) assert dict(config) == { "read_only": False, @@ -152,6 +154,7 @@ def test_filesystem_configuration_with_additional_arguments() -> None: "credentials": None, "kwargs": {"use_ssl": True}, "client_kwargs": {"verify": "public.crt"}, + "deltalake_storage_options": {"AWS_S3_LOCKING_PROVIDER": "dynamodb"}, } diff --git a/tests/load/filesystem/test_object_store_rs_credentials.py b/tests/load/filesystem/test_object_store_rs_credentials.py new file mode 100644 index 0000000000..4e43b7c5d8 --- /dev/null +++ b/tests/load/filesystem/test_object_store_rs_credentials.py @@ -0,0 +1,148 @@ +"""Tests translation of `dlt` credentials into `object_store` Rust crate credentials.""" + +from typing import Any, Dict, cast + +import pytest +from deltalake import DeltaTable +from deltalake.exceptions import TableNotFoundError + +import dlt +from dlt.common.typing import TSecretStrValue +from dlt.common.configuration import resolve_configuration +from dlt.common.configuration.specs import ( + AnyAzureCredentials, + AzureServicePrincipalCredentialsWithoutDefaults, + AzureCredentialsWithoutDefaults, + AwsCredentials, + AwsCredentialsWithoutDefaults, + GcpServiceAccountCredentialsWithoutDefaults, + GcpOAuthCredentialsWithoutDefaults, +) + +from tests.load.utils import AZ_BUCKET, AWS_BUCKET, GCS_BUCKET, ALL_FILESYSTEM_DRIVERS + +if all(driver not in ALL_FILESYSTEM_DRIVERS for driver in ("az", "s3", "gs")): + pytest.skip( + "Requires at least one of `az`, `s3`, `gs` in `ALL_FILESYSTEM_DRIVERS`.", + allow_module_level=True, + ) + + +FS_CREDS: Dict[str, Any] = dlt.secrets.get("destination.filesystem.credentials") +assert ( + FS_CREDS is not None +), "`destination.filesystem.credentials` must be configured for these tests." + + +def can_connect(bucket_url: str, object_store_rs_credentials: Dict[str, str]) -> bool: + """Returns True if client can connect to object store, False otherwise. + + Uses `deltatable` library as Python interface to `object_store` Rust crate. 
+ """ + try: + DeltaTable( + bucket_url, + storage_options=object_store_rs_credentials, + ) + except TableNotFoundError: + # this error implies the connection was succesful + # there is no Delta table at `bucket_url` + return True + return False + + +@pytest.mark.skipif( + "az" not in ALL_FILESYSTEM_DRIVERS, reason="`az` not in `ALL_FILESYSTEM_DRIVERS`" +) +def test_azure_object_store_rs_credentials() -> None: + creds: AnyAzureCredentials + + creds = AzureServicePrincipalCredentialsWithoutDefaults( + **dlt.secrets.get("destination.fsazureprincipal.credentials") + ) + assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + + # without SAS token + creds = AzureCredentialsWithoutDefaults( + azure_storage_account_name=FS_CREDS["azure_storage_account_name"], + azure_storage_account_key=FS_CREDS["azure_storage_account_key"], + ) + assert creds.azure_storage_sas_token is None + assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + + # with SAS token + creds = resolve_configuration(creds) + assert creds.azure_storage_sas_token is not None + assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + + +@pytest.mark.skipif( + "s3" not in ALL_FILESYSTEM_DRIVERS, reason="`s3` not in `ALL_FILESYSTEM_DRIVERS`" +) +def test_aws_object_store_rs_credentials() -> None: + creds: AwsCredentialsWithoutDefaults + + # AwsCredentials: no user-provided session token + creds = AwsCredentials( + aws_access_key_id=FS_CREDS["aws_access_key_id"], + aws_secret_access_key=FS_CREDS["aws_secret_access_key"], + region_name=FS_CREDS["region_name"], + ) + assert creds.aws_session_token is None + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert object_store_rs_creds["aws_session_token"] is not None # auto-generated token + assert can_connect(AWS_BUCKET, object_store_rs_creds) + + # AwsCredentials: user-provided session token + # use previous credentials to create session token for new credentials + sess_creds = creds.to_session_credentials() + creds = AwsCredentials( + aws_access_key_id=sess_creds["aws_access_key_id"], + aws_secret_access_key=cast(TSecretStrValue, sess_creds["aws_secret_access_key"]), + aws_session_token=cast(TSecretStrValue, sess_creds["aws_session_token"]), + region_name=FS_CREDS["region_name"], + ) + assert creds.aws_session_token is not None + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert object_store_rs_creds["aws_session_token"] is not None + assert can_connect(AWS_BUCKET, object_store_rs_creds) + + # AwsCredentialsWithoutDefaults: no user-provided session token + creds = AwsCredentialsWithoutDefaults( + aws_access_key_id=FS_CREDS["aws_access_key_id"], + aws_secret_access_key=FS_CREDS["aws_secret_access_key"], + region_name=FS_CREDS["region_name"], + ) + assert creds.aws_session_token is None + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert "aws_session_token" not in object_store_rs_creds # no auto-generated token + assert can_connect(AWS_BUCKET, object_store_rs_creds) + + # AwsCredentialsWithoutDefaults: user-provided session token + creds = AwsCredentialsWithoutDefaults( + aws_access_key_id=sess_creds["aws_access_key_id"], + aws_secret_access_key=cast(TSecretStrValue, sess_creds["aws_secret_access_key"]), + aws_session_token=cast(TSecretStrValue, sess_creds["aws_session_token"]), + region_name=FS_CREDS["region_name"], + ) + assert creds.aws_session_token is not None + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert object_store_rs_creds["aws_session_token"] is 
not None + assert can_connect(AWS_BUCKET, object_store_rs_creds) + + +@pytest.mark.skipif( + "gs" not in ALL_FILESYSTEM_DRIVERS, reason="`gs` not in `ALL_FILESYSTEM_DRIVERS`" +) +def test_gcp_object_store_rs_credentials() -> None: + creds = GcpServiceAccountCredentialsWithoutDefaults( + project_id=FS_CREDS["project_id"], + private_key=FS_CREDS["private_key"], + private_key_id=FS_CREDS["private_key_id"], + client_email=FS_CREDS["client_email"], + ) + assert can_connect(GCS_BUCKET, creds.to_object_store_rs_credentials()) + + # GcpOAuthCredentialsWithoutDefaults is currently not supported + with pytest.raises(NotImplementedError): + GcpOAuthCredentialsWithoutDefaults().to_object_store_rs_credentials() diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 623284d8a7..efbdc082f1 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -12,11 +12,11 @@ from dlt.common import pendulum from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.utils import uniq_id -from dlt.common.storages.load_storage import LoadJobInfo from dlt.destinations import filesystem from dlt.destinations.impl.filesystem.filesystem import FilesystemClient +from dlt.pipeline.exceptions import PipelineStepFailed -from tests.cases import arrow_table_all_data_types +from tests.cases import arrow_table_all_data_types, table_update_and_row, assert_all_data_types_row from tests.common.utils import load_json_case from tests.utils import ALL_TEST_DATA_ITEM_FORMATS, TestDataItemFormat, skip_if_not_active from dlt.destinations.path_utils import create_path @@ -25,12 +25,18 @@ DestinationTestConfiguration, ) -from tests.pipeline.utils import load_table_counts +from tests.pipeline.utils import load_table_counts, assert_load_info, load_tables_to_dicts skip_if_not_active("filesystem") +@pytest.fixture +def local_filesystem_pipeline() -> dlt.Pipeline: + os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "_storage" + return dlt.pipeline(pipeline_name="fs_pipe", destination="filesystem", full_refresh=True) + + def test_pipeline_merge_write_disposition(default_buckets_env: str) -> None: """Run pipeline twice with merge write disposition Regardless wether primary key is set or not, filesystem appends @@ -216,6 +222,244 @@ def some_source(): assert table.column("value").to_pylist() == [1, 2, 3, 4, 5] +@pytest.mark.essential +def test_delta_table_core( + default_buckets_env: str, + local_filesystem_pipeline: dlt.Pipeline, +) -> None: + """Tests core functionality for `delta` table format. + + Tests all data types, all filesystems, all write dispositions. 
+ """ + + from tests.pipeline.utils import _get_delta_table + + if default_buckets_env.startswith("memory://"): + pytest.skip( + "`deltalake` library does not support `memory` protocol (write works, read doesn't)" + ) + if default_buckets_env.startswith("s3://"): + # https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/ + os.environ["DESTINATION__FILESYSTEM__DELTALAKE_STORAGE_OPTIONS"] = ( + '{"AWS_S3_ALLOW_UNSAFE_RENAME": "true"}' + ) + + # create resource that yields rows with all data types + column_schemas, row = table_update_and_row() + + @dlt.resource(columns=column_schemas, table_format="delta") + def data_types(): + nonlocal row + yield [row] * 10 + + # run pipeline, this should create Delta table + info = local_filesystem_pipeline.run(data_types()) + assert_load_info(info) + + # `delta` table format should use `parquet` file format + completed_jobs = info.load_packages[0].jobs["completed_jobs"] + data_types_jobs = [ + job for job in completed_jobs if job.job_file_info.table_name == "data_types" + ] + assert all([job.file_path.endswith((".parquet", ".reference")) for job in data_types_jobs]) + + # 10 rows should be loaded to the Delta table and the content of the first + # row should match expected values + rows = load_tables_to_dicts(local_filesystem_pipeline, "data_types", exclude_system_cols=True)[ + "data_types" + ] + assert len(rows) == 10 + assert_all_data_types_row(rows[0], schema=column_schemas) + + # another run should append rows to the table + info = local_filesystem_pipeline.run(data_types()) + assert_load_info(info) + rows = load_tables_to_dicts(local_filesystem_pipeline, "data_types", exclude_system_cols=True)[ + "data_types" + ] + assert len(rows) == 20 + + # ensure "replace" write disposition is handled + # should do logical replace, increasing the table version + info = local_filesystem_pipeline.run(data_types(), write_disposition="replace") + assert_load_info(info) + client = cast(FilesystemClient, local_filesystem_pipeline.destination_client()) + assert _get_delta_table(client, "data_types").version() == 2 + rows = load_tables_to_dicts(local_filesystem_pipeline, "data_types", exclude_system_cols=True)[ + "data_types" + ] + assert len(rows) == 10 + + # `merge` resolves to `append` behavior + info = local_filesystem_pipeline.run(data_types(), write_disposition="merge") + assert_load_info(info) + rows = load_tables_to_dicts(local_filesystem_pipeline, "data_types", exclude_system_cols=True)[ + "data_types" + ] + assert len(rows) == 20 + + +def test_delta_table_multiple_files( + local_filesystem_pipeline: dlt.Pipeline, +) -> None: + """Tests loading multiple files into a Delta table. + + Files should be loaded into the Delta table in a single commit. 
+ """ + + from tests.pipeline.utils import _get_delta_table + + os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "2" # force multiple files + + @dlt.resource(table_format="delta") + def delta_table(): + yield [{"foo": True}] * 10 + + info = local_filesystem_pipeline.run(delta_table()) + assert_load_info(info) + + # multiple Parquet files should have been created + completed_jobs = info.load_packages[0].jobs["completed_jobs"] + delta_table_parquet_jobs = [ + job + for job in completed_jobs + if job.job_file_info.table_name == "delta_table" + and job.job_file_info.file_format == "parquet" + ] + assert len(delta_table_parquet_jobs) == 5 # 10 records, max 2 per file + + # all 10 records should have been loaded into a Delta table in a single commit + client = cast(FilesystemClient, local_filesystem_pipeline.destination_client()) + assert _get_delta_table(client, "delta_table").version() == 0 + rows = load_tables_to_dicts(local_filesystem_pipeline, "delta_table", exclude_system_cols=True)[ + "delta_table" + ] + assert len(rows) == 10 + + +def test_delta_table_child_tables( + local_filesystem_pipeline: dlt.Pipeline, +) -> None: + """Tests child table handling for `delta` table format.""" + + @dlt.resource(table_format="delta") + def complex_table(): + yield [ + { + "foo": 1, + "child": [{"bar": True, "grandchild": [1, 2]}, {"bar": True, "grandchild": [1]}], + }, + { + "foo": 2, + "child": [ + {"bar": False, "grandchild": [1, 3]}, + ], + }, + ] + + info = local_filesystem_pipeline.run(complex_table()) + assert_load_info(info) + rows_dict = load_tables_to_dicts( + local_filesystem_pipeline, + "complex_table", + "complex_table__child", + "complex_table__child__grandchild", + exclude_system_cols=True, + ) + # assert row counts + assert len(rows_dict["complex_table"]) == 2 + assert len(rows_dict["complex_table__child"]) == 3 + assert len(rows_dict["complex_table__child__grandchild"]) == 5 + # assert column names + assert rows_dict["complex_table"][0].keys() == {"foo"} + assert rows_dict["complex_table__child"][0].keys() == {"bar"} + assert rows_dict["complex_table__child__grandchild"][0].keys() == {"value"} + + # test write disposition handling with child tables + info = local_filesystem_pipeline.run(complex_table()) + assert_load_info(info) + rows_dict = load_tables_to_dicts( + local_filesystem_pipeline, + "complex_table", + "complex_table__child", + "complex_table__child__grandchild", + exclude_system_cols=True, + ) + assert len(rows_dict["complex_table"]) == 2 * 2 + assert len(rows_dict["complex_table__child"]) == 3 * 2 + assert len(rows_dict["complex_table__child__grandchild"]) == 5 * 2 + + info = local_filesystem_pipeline.run(complex_table(), write_disposition="replace") + assert_load_info(info) + rows_dict = load_tables_to_dicts( + local_filesystem_pipeline, + "complex_table", + "complex_table__child", + "complex_table__child__grandchild", + exclude_system_cols=True, + ) + assert len(rows_dict["complex_table"]) == 2 + assert len(rows_dict["complex_table__child"]) == 3 + assert len(rows_dict["complex_table__child__grandchild"]) == 5 + + +def test_delta_table_mixed_source( + local_filesystem_pipeline: dlt.Pipeline, +) -> None: + """Tests file format handling in mixed source. + + One resource uses `delta` table format, the other doesn't. 
+ """ + + @dlt.resource(table_format="delta") + def delta_table(): + yield [{"foo": True}] + + @dlt.resource() + def non_delta_table(): + yield [1, 2, 3] + + @dlt.source + def s(): + return [delta_table(), non_delta_table()] + + info = local_filesystem_pipeline.run( + s(), loader_file_format="jsonl" + ) # set file format at pipeline level + assert_load_info(info) + completed_jobs = info.load_packages[0].jobs["completed_jobs"] + + # `jsonl` file format should be overridden for `delta_table` resource + # because it's not supported for `delta` table format + delta_table_jobs = [ + job for job in completed_jobs if job.job_file_info.table_name == "delta_table" + ] + assert all([job.file_path.endswith((".parquet", ".reference")) for job in delta_table_jobs]) + + # `jsonl` file format should be respected for `non_delta_table` resource + non_delta_table_job = [ + job for job in completed_jobs if job.job_file_info.table_name == "non_delta_table" + ][0] + assert non_delta_table_job.file_path.endswith(".jsonl") + + +def test_delta_table_dynamic_dispatch( + local_filesystem_pipeline: dlt.Pipeline, +) -> None: + @dlt.resource(primary_key="id", table_name=lambda i: i["type"], table_format="delta") + def github_events(): + with open( + "tests/normalize/cases/github.events.load_page_1_duck.json", "r", encoding="utf-8" + ) as f: + yield json.load(f) + + info = local_filesystem_pipeline.run(github_events()) + assert_load_info(info) + completed_jobs = info.load_packages[0].jobs["completed_jobs"] + # 20 event types, two jobs per table (.parquet and .reference), 1 job for _dlt_pipeline_state + assert len(completed_jobs) == 2 * 20 + 1 + + TEST_LAYOUTS = ( "{schema_name}/{table_name}/{load_id}.{file_id}.{ext}", "{schema_name}.{table_name}.{load_id}.{file_id}.{ext}", diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index b4dae919f8..7affcc5a81 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -1,5 +1,4 @@ -import posixpath -from typing import Any, Dict, List, Tuple, Callable, Sequence +from typing import Any, Dict, List, Callable, Sequence import pytest import random from os import environ @@ -8,7 +7,7 @@ import dlt from dlt.common import json, sleep from dlt.common.pipeline import LoadInfo -from dlt.common.schema.typing import LOADS_TABLE_NAME +from dlt.common.schema.utils import get_table_format from dlt.common.typing import DictStrAny from dlt.destinations.impl.filesystem.filesystem import FilesystemClient from dlt.destinations.fs_client import FSClientBase @@ -16,7 +15,6 @@ from dlt.common.storages import FileStorage from dlt.destinations.exceptions import DatabaseUndefinedRelation -from tests.utils import TEST_STORAGE_ROOT PIPELINE_TEST_CASES_PATH = "./tests/pipeline/cases/" @@ -156,19 +154,39 @@ def _load_file(client: FSClientBase, filepath) -> List[Dict[str, Any]]: # # Load table dicts # +def _get_delta_table(client: FilesystemClient, table_name: str) -> "DeltaTable": # type: ignore[name-defined] # noqa: F821 + from deltalake import DeltaTable + from dlt.common.libs.deltalake import _deltalake_storage_options + + table_dir = client.get_table_dir(table_name) + remote_table_dir = f"{client.config.protocol}://{table_dir}" + return DeltaTable( + remote_table_dir, + storage_options=_deltalake_storage_options(client.config), + ) + + def _load_tables_to_dicts_fs(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: """For now this will expect the standard layout in the filesystem destination, if changed the results will not be correct""" client = p._fs_client() 
result: Dict[str, Any] = {} for table_name in table_names: - table_files = client.list_table_files(table_name) - for file in table_files: - items = _load_file(client, file) - if table_name in result: - result[table_name] = result[table_name] + items - else: - result[table_name] = items + if ( + table_name in p.default_schema.data_table_names() + and get_table_format(p.default_schema.tables, table_name) == "delta" + ): + assert isinstance(client, FilesystemClient) + dt = _get_delta_table(client, table_name) + result[table_name] = dt.to_pyarrow_table().to_pylist() + else: + table_files = client.list_table_files(table_name) + for file in table_files: + items = _load_file(client, file) + if table_name in result: + result[table_name] = result[table_name] + items + else: + result[table_name] = items return result @@ -194,11 +212,29 @@ def _load_tables_to_dicts_sql( def load_tables_to_dicts( - p: dlt.Pipeline, *table_names: str, schema_name: str = None + p: dlt.Pipeline, + *table_names: str, + schema_name: str = None, + exclude_system_cols: bool = False, + sortkey: str = None, ) -> Dict[str, List[Dict[str, Any]]]: + def _exclude_system_cols(dict_: Dict[str, Any]) -> Dict[str, Any]: + return {k: v for k, v in dict_.items() if not k.startswith("_dlt")} + + def _sort_list_of_dicts(list_: List[Dict[str, Any]], sortkey: str) -> List[Dict[str, Any]]: + """Sort list of dictionaries by dictionary key.""" + return sorted(list_, key=lambda d: d[sortkey]) + if _is_filesystem(p): - return _load_tables_to_dicts_fs(p, *table_names) - return _load_tables_to_dicts_sql(p, *table_names, schema_name=schema_name) + result = _load_tables_to_dicts_fs(p, *table_names) + else: + result = _load_tables_to_dicts_sql(p, *table_names, schema_name=schema_name) + + if exclude_system_cols: + result = {k: [_exclude_system_cols(d) for d in v] for k, v in result.items()} + if sortkey is not None: + result = {k: _sort_list_of_dicts(v, sortkey) for k, v in result.items()} + return result def assert_only_table_columns( From fa9b5fdd00dd4ce784515becc92fb8f42a68bd61 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Thu, 6 Jun 2024 23:04:57 +0200 Subject: [PATCH 12/61] bump node version used in docs (#1445) --- docs/website/package-lock.json | 925 ++++++++++++++++++++++----------- docs/website/package.json | 6 +- 2 files changed, 611 insertions(+), 320 deletions(-) diff --git a/docs/website/package-lock.json b/docs/website/package-lock.json index 3fa5ec429d..f99a70dbf7 100644 --- a/docs/website/package-lock.json +++ b/docs/website/package-lock.json @@ -8,8 +8,8 @@ "name": "dlt-docs", "version": "0.0.0", "dependencies": { - "@docusaurus/core": "2.4.1", - "@docusaurus/preset-classic": "2.4.1", + "@docusaurus/core": "2.4.3", + "@docusaurus/preset-classic": "2.4.3", "@mdx-js/react": "^1.6.22", "clsx": "^1.2.1", "dedent": "^1.5.1", @@ -29,7 +29,7 @@ "concurrently": "^8.2.2" }, "engines": { - "node": ">=16.14" + "node": ">=20.10" } }, "node_modules/@algolia/autocomplete-core": { @@ -74,74 +74,74 @@ } }, "node_modules/@algolia/cache-browser-local-storage": { - "version": "4.19.1", - "resolved": "https://registry.npmjs.org/@algolia/cache-browser-local-storage/-/cache-browser-local-storage-4.19.1.tgz", - "integrity": "sha512-FYAZWcGsFTTaSAwj9Std8UML3Bu8dyWDncM7Ls8g+58UOe4XYdlgzXWbrIgjaguP63pCCbMoExKr61B+ztK3tw==", + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/@algolia/cache-browser-local-storage/-/cache-browser-local-storage-4.23.3.tgz", + "integrity": 
"sha512-vRHXYCpPlTDE7i6UOy2xE03zHF2C8MEFjPN2v7fRbqVpcOvAUQK81x3Kc21xyb5aSIpYCjWCZbYZuz8Glyzyyg==", "dependencies": { - "@algolia/cache-common": "4.19.1" + "@algolia/cache-common": "4.23.3" } }, "node_modules/@algolia/cache-common": { - "version": "4.19.1", - "resolved": "https://registry.npmjs.org/@algolia/cache-common/-/cache-common-4.19.1.tgz", - "integrity": "sha512-XGghi3l0qA38HiqdoUY+wvGyBsGvKZ6U3vTiMBT4hArhP3fOGLXpIINgMiiGjTe4FVlTa5a/7Zf2bwlIHfRqqg==" + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/@algolia/cache-common/-/cache-common-4.23.3.tgz", + "integrity": "sha512-h9XcNI6lxYStaw32pHpB1TMm0RuxphF+Ik4o7tcQiodEdpKK+wKufY6QXtba7t3k8eseirEMVB83uFFF3Nu54A==" }, "node_modules/@algolia/cache-in-memory": { - "version": "4.19.1", - "resolved": "https://registry.npmjs.org/@algolia/cache-in-memory/-/cache-in-memory-4.19.1.tgz", - "integrity": "sha512-+PDWL+XALGvIginigzu8oU6eWw+o76Z8zHbBovWYcrtWOEtinbl7a7UTt3x3lthv+wNuFr/YD1Gf+B+A9V8n5w==", + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/@algolia/cache-in-memory/-/cache-in-memory-4.23.3.tgz", + "integrity": "sha512-yvpbuUXg/+0rbcagxNT7un0eo3czx2Uf0y4eiR4z4SD7SiptwYTpbuS0IHxcLHG3lq22ukx1T6Kjtk/rT+mqNg==", "dependencies": { - "@algolia/cache-common": "4.19.1" + "@algolia/cache-common": "4.23.3" } }, "node_modules/@algolia/client-account": { - "version": "4.19.1", - "resolved": "https://registry.npmjs.org/@algolia/client-account/-/client-account-4.19.1.tgz", - "integrity": "sha512-Oy0ritA2k7AMxQ2JwNpfaEcgXEDgeyKu0V7E7xt/ZJRdXfEpZcwp9TOg4TJHC7Ia62gIeT2Y/ynzsxccPw92GA==", + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/@algolia/client-account/-/client-account-4.23.3.tgz", + "integrity": "sha512-hpa6S5d7iQmretHHF40QGq6hz0anWEHGlULcTIT9tbUssWUriN9AUXIFQ8Ei4w9azD0hc1rUok9/DeQQobhQMA==", "dependencies": { - "@algolia/client-common": "4.19.1", - "@algolia/client-search": "4.19.1", - "@algolia/transporter": "4.19.1" + "@algolia/client-common": "4.23.3", + "@algolia/client-search": "4.23.3", + "@algolia/transporter": "4.23.3" } }, "node_modules/@algolia/client-analytics": { - "version": "4.19.1", - "resolved": "https://registry.npmjs.org/@algolia/client-analytics/-/client-analytics-4.19.1.tgz", - "integrity": "sha512-5QCq2zmgdZLIQhHqwl55ZvKVpLM3DNWjFI4T+bHr3rGu23ew2bLO4YtyxaZeChmDb85jUdPDouDlCumGfk6wOg==", + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/@algolia/client-analytics/-/client-analytics-4.23.3.tgz", + "integrity": "sha512-LBsEARGS9cj8VkTAVEZphjxTjMVCci+zIIiRhpFun9jGDUlS1XmhCW7CTrnaWeIuCQS/2iPyRqSy1nXPjcBLRA==", "dependencies": { - "@algolia/client-common": "4.19.1", - "@algolia/client-search": "4.19.1", - "@algolia/requester-common": "4.19.1", - "@algolia/transporter": "4.19.1" + "@algolia/client-common": "4.23.3", + "@algolia/client-search": "4.23.3", + "@algolia/requester-common": "4.23.3", + "@algolia/transporter": "4.23.3" } }, "node_modules/@algolia/client-common": { - "version": "4.19.1", - "resolved": "https://registry.npmjs.org/@algolia/client-common/-/client-common-4.19.1.tgz", - "integrity": "sha512-3kAIVqTcPrjfS389KQvKzliC559x+BDRxtWamVJt8IVp7LGnjq+aVAXg4Xogkur1MUrScTZ59/AaUd5EdpyXgA==", + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/@algolia/client-common/-/client-common-4.23.3.tgz", + "integrity": "sha512-l6EiPxdAlg8CYhroqS5ybfIczsGUIAC47slLPOMDeKSVXYG1n0qGiz4RjAHLw2aD0xzh2EXZ7aRguPfz7UKDKw==", "dependencies": { - "@algolia/requester-common": "4.19.1", - "@algolia/transporter": "4.19.1" + "@algolia/requester-common": "4.23.3", + 
"@algolia/transporter": "4.23.3" } }, "node_modules/@algolia/client-personalization": { - "version": "4.19.1", - "resolved": "https://registry.npmjs.org/@algolia/client-personalization/-/client-personalization-4.19.1.tgz", - "integrity": "sha512-8CWz4/H5FA+krm9HMw2HUQenizC/DxUtsI5oYC0Jxxyce1vsr8cb1aEiSJArQT6IzMynrERif1RVWLac1m36xw==", + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/@algolia/client-personalization/-/client-personalization-4.23.3.tgz", + "integrity": "sha512-3E3yF3Ocr1tB/xOZiuC3doHQBQ2zu2MPTYZ0d4lpfWads2WTKG7ZzmGnsHmm63RflvDeLK/UVx7j2b3QuwKQ2g==", "dependencies": { - "@algolia/client-common": "4.19.1", - "@algolia/requester-common": "4.19.1", - "@algolia/transporter": "4.19.1" + "@algolia/client-common": "4.23.3", + "@algolia/requester-common": "4.23.3", + "@algolia/transporter": "4.23.3" } }, "node_modules/@algolia/client-search": { - "version": "4.19.1", - "resolved": "https://registry.npmjs.org/@algolia/client-search/-/client-search-4.19.1.tgz", - "integrity": "sha512-mBecfMFS4N+yK/p0ZbK53vrZbL6OtWMk8YmnOv1i0LXx4pelY8TFhqKoTit3NPVPwoSNN0vdSN9dTu1xr1XOVw==", + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/@algolia/client-search/-/client-search-4.23.3.tgz", + "integrity": "sha512-P4VAKFHqU0wx9O+q29Q8YVuaowaZ5EM77rxfmGnkHUJggh28useXQdopokgwMeYw2XUht49WX5RcTQ40rZIabw==", "dependencies": { - "@algolia/client-common": "4.19.1", - "@algolia/requester-common": "4.19.1", - "@algolia/transporter": "4.19.1" + "@algolia/client-common": "4.23.3", + "@algolia/requester-common": "4.23.3", + "@algolia/transporter": "4.23.3" } }, "node_modules/@algolia/events": { @@ -150,47 +150,65 @@ "integrity": "sha512-FQzvOCgoFXAbf5Y6mYozw2aj5KCJoA3m4heImceldzPSMbdyS4atVjJzXKMsfX3wnZTFYwkkt8/z8UesLHlSBQ==" }, "node_modules/@algolia/logger-common": { - "version": "4.19.1", - "resolved": "https://registry.npmjs.org/@algolia/logger-common/-/logger-common-4.19.1.tgz", - "integrity": "sha512-i6pLPZW/+/YXKis8gpmSiNk1lOmYCmRI6+x6d2Qk1OdfvX051nRVdalRbEcVTpSQX6FQAoyeaui0cUfLYW5Elw==" + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/@algolia/logger-common/-/logger-common-4.23.3.tgz", + "integrity": "sha512-y9kBtmJwiZ9ZZ+1Ek66P0M68mHQzKRxkW5kAAXYN/rdzgDN0d2COsViEFufxJ0pb45K4FRcfC7+33YB4BLrZ+g==" }, "node_modules/@algolia/logger-console": { - "version": "4.19.1", - "resolved": "https://registry.npmjs.org/@algolia/logger-console/-/logger-console-4.19.1.tgz", - "integrity": "sha512-jj72k9GKb9W0c7TyC3cuZtTr0CngLBLmc8trzZlXdfvQiigpUdvTi1KoWIb2ZMcRBG7Tl8hSb81zEY3zI2RlXg==", + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/@algolia/logger-console/-/logger-console-4.23.3.tgz", + "integrity": "sha512-8xoiseoWDKuCVnWP8jHthgaeobDLolh00KJAdMe9XPrWPuf1by732jSpgy2BlsLTaT9m32pHI8CRfrOqQzHv3A==", "dependencies": { - "@algolia/logger-common": "4.19.1" + "@algolia/logger-common": "4.23.3" + } + }, + "node_modules/@algolia/recommend": { + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/@algolia/recommend/-/recommend-4.23.3.tgz", + "integrity": "sha512-9fK4nXZF0bFkdcLBRDexsnGzVmu4TSYZqxdpgBW2tEyfuSSY54D4qSRkLmNkrrz4YFvdh2GM1gA8vSsnZPR73w==", + "dependencies": { + "@algolia/cache-browser-local-storage": "4.23.3", + "@algolia/cache-common": "4.23.3", + "@algolia/cache-in-memory": "4.23.3", + "@algolia/client-common": "4.23.3", + "@algolia/client-search": "4.23.3", + "@algolia/logger-common": "4.23.3", + "@algolia/logger-console": "4.23.3", + "@algolia/requester-browser-xhr": "4.23.3", + "@algolia/requester-common": "4.23.3", + 
"@algolia/requester-node-http": "4.23.3", + "@algolia/transporter": "4.23.3" } }, "node_modules/@algolia/requester-browser-xhr": { - "version": "4.19.1", - "resolved": "https://registry.npmjs.org/@algolia/requester-browser-xhr/-/requester-browser-xhr-4.19.1.tgz", - "integrity": "sha512-09K/+t7lptsweRTueHnSnmPqIxbHMowejAkn9XIcJMLdseS3zl8ObnS5GWea86mu3vy4+8H+ZBKkUN82Zsq/zg==", + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/@algolia/requester-browser-xhr/-/requester-browser-xhr-4.23.3.tgz", + "integrity": "sha512-jDWGIQ96BhXbmONAQsasIpTYWslyjkiGu0Quydjlowe+ciqySpiDUrJHERIRfELE5+wFc7hc1Q5hqjGoV7yghw==", "dependencies": { - "@algolia/requester-common": "4.19.1" + "@algolia/requester-common": "4.23.3" } }, "node_modules/@algolia/requester-common": { - "version": "4.19.1", - "resolved": "https://registry.npmjs.org/@algolia/requester-common/-/requester-common-4.19.1.tgz", - "integrity": "sha512-BisRkcWVxrDzF1YPhAckmi2CFYK+jdMT60q10d7z3PX+w6fPPukxHRnZwooiTUrzFe50UBmLItGizWHP5bDzVQ==" + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/@algolia/requester-common/-/requester-common-4.23.3.tgz", + "integrity": "sha512-xloIdr/bedtYEGcXCiF2muajyvRhwop4cMZo+K2qzNht0CMzlRkm8YsDdj5IaBhshqfgmBb3rTg4sL4/PpvLYw==" }, "node_modules/@algolia/requester-node-http": { - "version": "4.19.1", - "resolved": "https://registry.npmjs.org/@algolia/requester-node-http/-/requester-node-http-4.19.1.tgz", - "integrity": "sha512-6DK52DHviBHTG2BK/Vv2GIlEw7i+vxm7ypZW0Z7vybGCNDeWzADx+/TmxjkES2h15+FZOqVf/Ja677gePsVItA==", + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/@algolia/requester-node-http/-/requester-node-http-4.23.3.tgz", + "integrity": "sha512-zgu++8Uj03IWDEJM3fuNl34s746JnZOWn1Uz5taV1dFyJhVM/kTNw9Ik7YJWiUNHJQXcaD8IXD1eCb0nq/aByA==", "dependencies": { - "@algolia/requester-common": "4.19.1" + "@algolia/requester-common": "4.23.3" } }, "node_modules/@algolia/transporter": { - "version": "4.19.1", - "resolved": "https://registry.npmjs.org/@algolia/transporter/-/transporter-4.19.1.tgz", - "integrity": "sha512-nkpvPWbpuzxo1flEYqNIbGz7xhfhGOKGAZS7tzC+TELgEmi7z99qRyTfNSUlW7LZmB3ACdnqAo+9A9KFBENviQ==", + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/@algolia/transporter/-/transporter-4.23.3.tgz", + "integrity": "sha512-Wjl5gttqnf/gQKJA+dafnD0Y6Yw97yvfY8R9h0dQltX1GXTgNs1zWgvtWW0tHl1EgMdhAyw189uWiZMnL3QebQ==", "dependencies": { - "@algolia/cache-common": "4.19.1", - "@algolia/logger-common": "4.19.1", - "@algolia/requester-common": "4.19.1" + "@algolia/cache-common": "4.23.3", + "@algolia/logger-common": "4.23.3", + "@algolia/requester-common": "4.23.3" } }, "node_modules/@ampproject/remapping": { @@ -2070,18 +2088,18 @@ } }, "node_modules/@docsearch/css": { - "version": "3.5.2", - "resolved": "https://registry.npmjs.org/@docsearch/css/-/css-3.5.2.tgz", - "integrity": "sha512-SPiDHaWKQZpwR2siD0KQUwlStvIAnEyK6tAE2h2Wuoq8ue9skzhlyVQ1ddzOxX6khULnAALDiR/isSF3bnuciA==" + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@docsearch/css/-/css-3.6.0.tgz", + "integrity": "sha512-+sbxb71sWre+PwDK7X2T8+bhS6clcVMLwBPznX45Qu6opJcgRjAp7gYSDzVFp187J+feSj5dNBN1mJoi6ckkUQ==" }, "node_modules/@docsearch/react": { - "version": "3.5.2", - "resolved": "https://registry.npmjs.org/@docsearch/react/-/react-3.5.2.tgz", - "integrity": "sha512-9Ahcrs5z2jq/DcAvYtvlqEBHImbm4YJI8M9y0x6Tqg598P40HTEkX7hsMcIuThI+hTFxRGZ9hll0Wygm2yEjng==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/@docsearch/react/-/react-3.6.0.tgz", + "integrity": 
"sha512-HUFut4ztcVNmqy9gp/wxNbC7pTOHhgVVkHVGCACTuLhUKUhKAF9KYHJtMiLUJxEqiFLQiuri1fWF8zqwM/cu1w==", "dependencies": { "@algolia/autocomplete-core": "1.9.3", "@algolia/autocomplete-preset-algolia": "1.9.3", - "@docsearch/css": "3.5.2", + "@docsearch/css": "3.6.0", "algoliasearch": "^4.19.1" }, "peerDependencies": { @@ -2106,9 +2124,9 @@ } }, "node_modules/@docusaurus/core": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/core/-/core-2.4.1.tgz", - "integrity": "sha512-SNsY7PshK3Ri7vtsLXVeAJGS50nJN3RgF836zkyUfAD01Fq+sAk5EwWgLw+nnm5KVNGDu7PRR2kRGDsWvqpo0g==", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/core/-/core-2.4.3.tgz", + "integrity": "sha512-dWH5P7cgeNSIg9ufReX6gaCl/TmrGKD38Orbwuz05WPhAQtFXHd5B8Qym1TiXfvUNvwoYKkAJOJuGe8ou0Z7PA==", "dependencies": { "@babel/core": "^7.18.6", "@babel/generator": "^7.18.7", @@ -2120,13 +2138,13 @@ "@babel/runtime": "^7.18.6", "@babel/runtime-corejs3": "^7.18.6", "@babel/traverse": "^7.18.8", - "@docusaurus/cssnano-preset": "2.4.1", - "@docusaurus/logger": "2.4.1", - "@docusaurus/mdx-loader": "2.4.1", + "@docusaurus/cssnano-preset": "2.4.3", + "@docusaurus/logger": "2.4.3", + "@docusaurus/mdx-loader": "2.4.3", "@docusaurus/react-loadable": "5.5.2", - "@docusaurus/utils": "2.4.1", - "@docusaurus/utils-common": "2.4.1", - "@docusaurus/utils-validation": "2.4.1", + "@docusaurus/utils": "2.4.3", + "@docusaurus/utils-common": "2.4.3", + "@docusaurus/utils-validation": "2.4.3", "@slorber/static-site-generator-webpack-plugin": "^4.0.7", "@svgr/webpack": "^6.2.1", "autoprefixer": "^10.4.7", @@ -2194,9 +2212,9 @@ } }, "node_modules/@docusaurus/cssnano-preset": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/cssnano-preset/-/cssnano-preset-2.4.1.tgz", - "integrity": "sha512-ka+vqXwtcW1NbXxWsh6yA1Ckii1klY9E53cJ4O9J09nkMBgrNX3iEFED1fWdv8wf4mJjvGi5RLZ2p9hJNjsLyQ==", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/cssnano-preset/-/cssnano-preset-2.4.3.tgz", + "integrity": "sha512-ZvGSRCi7z9wLnZrXNPG6DmVPHdKGd8dIn9pYbEOFiYihfv4uDR3UtxogmKf+rT8ZlKFf5Lqne8E8nt08zNM8CA==", "dependencies": { "cssnano-preset-advanced": "^5.3.8", "postcss": "^8.4.14", @@ -2208,9 +2226,9 @@ } }, "node_modules/@docusaurus/logger": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/logger/-/logger-2.4.1.tgz", - "integrity": "sha512-5h5ysIIWYIDHyTVd8BjheZmQZmEgWDR54aQ1BX9pjFfpyzFo5puKXKYrYJXbjEHGyVhEzmB9UXwbxGfaZhOjcg==", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/logger/-/logger-2.4.3.tgz", + "integrity": "sha512-Zxws7r3yLufk9xM1zq9ged0YHs65mlRmtsobnFkdZTxWXdTYlWWLWdKyNKAsVC+D7zg+pv2fGbyabdOnyZOM3w==", "dependencies": { "chalk": "^4.1.2", "tslib": "^2.4.0" @@ -2220,14 +2238,14 @@ } }, "node_modules/@docusaurus/mdx-loader": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/mdx-loader/-/mdx-loader-2.4.1.tgz", - "integrity": "sha512-4KhUhEavteIAmbBj7LVFnrVYDiU51H5YWW1zY6SmBSte/YLhDutztLTBE0PQl1Grux1jzUJeaSvAzHpTn6JJDQ==", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/mdx-loader/-/mdx-loader-2.4.3.tgz", + "integrity": "sha512-b1+fDnWtl3GiqkL0BRjYtc94FZrcDDBV1j8446+4tptB9BAOlePwG2p/pK6vGvfL53lkOsszXMghr2g67M0vCw==", "dependencies": { "@babel/parser": "^7.18.8", "@babel/traverse": "^7.18.8", - "@docusaurus/logger": "2.4.1", - "@docusaurus/utils": "2.4.1", + "@docusaurus/logger": "2.4.3", + "@docusaurus/utils": "2.4.3", "@mdx-js/mdx": "^1.6.22", "escape-html": "^1.0.3", 
"file-loader": "^6.2.0", @@ -2254,6 +2272,7 @@ "version": "2.4.1", "resolved": "https://registry.npmjs.org/@docusaurus/module-type-aliases/-/module-type-aliases-2.4.1.tgz", "integrity": "sha512-gLBuIFM8Dp2XOCWffUDSjtxY7jQgKvYujt7Mx5s4FCTfoL5dN1EVbnrn+O2Wvh8b0a77D57qoIDY7ghgmatR1A==", + "dev": true, "dependencies": { "@docusaurus/react-loadable": "5.5.2", "@docusaurus/types": "2.4.1", @@ -2270,17 +2289,17 @@ } }, "node_modules/@docusaurus/plugin-content-blog": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/plugin-content-blog/-/plugin-content-blog-2.4.1.tgz", - "integrity": "sha512-E2i7Knz5YIbE1XELI6RlTnZnGgS52cUO4BlCiCUCvQHbR+s1xeIWz4C6BtaVnlug0Ccz7nFSksfwDpVlkujg5Q==", - "dependencies": { - "@docusaurus/core": "2.4.1", - "@docusaurus/logger": "2.4.1", - "@docusaurus/mdx-loader": "2.4.1", - "@docusaurus/types": "2.4.1", - "@docusaurus/utils": "2.4.1", - "@docusaurus/utils-common": "2.4.1", - "@docusaurus/utils-validation": "2.4.1", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/plugin-content-blog/-/plugin-content-blog-2.4.3.tgz", + "integrity": "sha512-PVhypqaA0t98zVDpOeTqWUTvRqCEjJubtfFUQ7zJNYdbYTbS/E/ytq6zbLVsN/dImvemtO/5JQgjLxsh8XLo8Q==", + "dependencies": { + "@docusaurus/core": "2.4.3", + "@docusaurus/logger": "2.4.3", + "@docusaurus/mdx-loader": "2.4.3", + "@docusaurus/types": "2.4.3", + "@docusaurus/utils": "2.4.3", + "@docusaurus/utils-common": "2.4.3", + "@docusaurus/utils-validation": "2.4.3", "cheerio": "^1.0.0-rc.12", "feed": "^4.2.2", "fs-extra": "^10.1.0", @@ -2299,18 +2318,37 @@ "react-dom": "^16.8.4 || ^17.0.0" } }, - "node_modules/@docusaurus/plugin-content-docs": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/plugin-content-docs/-/plugin-content-docs-2.4.1.tgz", - "integrity": "sha512-Lo7lSIcpswa2Kv4HEeUcGYqaasMUQNpjTXpV0N8G6jXgZaQurqp7E8NGYeGbDXnb48czmHWbzDL4S3+BbK0VzA==", + "node_modules/@docusaurus/plugin-content-blog/node_modules/@docusaurus/types": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/types/-/types-2.4.3.tgz", + "integrity": "sha512-W6zNLGQqfrp/EoPD0bhb9n7OobP+RHpmvVzpA+Z/IuU3Q63njJM24hmT0GYboovWcDtFmnIJC9wcyx4RVPQscw==", "dependencies": { - "@docusaurus/core": "2.4.1", - "@docusaurus/logger": "2.4.1", - "@docusaurus/mdx-loader": "2.4.1", - "@docusaurus/module-type-aliases": "2.4.1", - "@docusaurus/types": "2.4.1", - "@docusaurus/utils": "2.4.1", - "@docusaurus/utils-validation": "2.4.1", + "@types/history": "^4.7.11", + "@types/react": "*", + "commander": "^5.1.0", + "joi": "^17.6.0", + "react-helmet-async": "^1.3.0", + "utility-types": "^3.10.0", + "webpack": "^5.73.0", + "webpack-merge": "^5.8.0" + }, + "peerDependencies": { + "react": "^16.8.4 || ^17.0.0", + "react-dom": "^16.8.4 || ^17.0.0" + } + }, + "node_modules/@docusaurus/plugin-content-docs": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/plugin-content-docs/-/plugin-content-docs-2.4.3.tgz", + "integrity": "sha512-N7Po2LSH6UejQhzTCsvuX5NOzlC+HiXOVvofnEPj0WhMu1etpLEXE6a4aTxrtg95lQ5kf0xUIdjX9sh3d3G76A==", + "dependencies": { + "@docusaurus/core": "2.4.3", + "@docusaurus/logger": "2.4.3", + "@docusaurus/mdx-loader": "2.4.3", + "@docusaurus/module-type-aliases": "2.4.3", + "@docusaurus/types": "2.4.3", + "@docusaurus/utils": "2.4.3", + "@docusaurus/utils-validation": "2.4.3", "@types/react-router-config": "^5.0.6", "combine-promises": "^1.1.0", "fs-extra": "^10.1.0", @@ -2329,16 +2367,54 @@ "react-dom": "^16.8.4 || ^17.0.0" } }, - 
"node_modules/@docusaurus/plugin-content-pages": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/plugin-content-pages/-/plugin-content-pages-2.4.1.tgz", - "integrity": "sha512-/UjuH/76KLaUlL+o1OvyORynv6FURzjurSjvn2lbWTFc4tpYY2qLYTlKpTCBVPhlLUQsfyFnshEJDLmPneq2oA==", + "node_modules/@docusaurus/plugin-content-docs/node_modules/@docusaurus/module-type-aliases": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/module-type-aliases/-/module-type-aliases-2.4.3.tgz", + "integrity": "sha512-cwkBkt1UCiduuvEAo7XZY01dJfRn7UR/75mBgOdb1hKknhrabJZ8YH+7savd/y9kLExPyrhe0QwdS9GuzsRRIA==", "dependencies": { - "@docusaurus/core": "2.4.1", - "@docusaurus/mdx-loader": "2.4.1", - "@docusaurus/types": "2.4.1", - "@docusaurus/utils": "2.4.1", - "@docusaurus/utils-validation": "2.4.1", + "@docusaurus/react-loadable": "5.5.2", + "@docusaurus/types": "2.4.3", + "@types/history": "^4.7.11", + "@types/react": "*", + "@types/react-router-config": "*", + "@types/react-router-dom": "*", + "react-helmet-async": "*", + "react-loadable": "npm:@docusaurus/react-loadable@5.5.2" + }, + "peerDependencies": { + "react": "*", + "react-dom": "*" + } + }, + "node_modules/@docusaurus/plugin-content-docs/node_modules/@docusaurus/types": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/types/-/types-2.4.3.tgz", + "integrity": "sha512-W6zNLGQqfrp/EoPD0bhb9n7OobP+RHpmvVzpA+Z/IuU3Q63njJM24hmT0GYboovWcDtFmnIJC9wcyx4RVPQscw==", + "dependencies": { + "@types/history": "^4.7.11", + "@types/react": "*", + "commander": "^5.1.0", + "joi": "^17.6.0", + "react-helmet-async": "^1.3.0", + "utility-types": "^3.10.0", + "webpack": "^5.73.0", + "webpack-merge": "^5.8.0" + }, + "peerDependencies": { + "react": "^16.8.4 || ^17.0.0", + "react-dom": "^16.8.4 || ^17.0.0" + } + }, + "node_modules/@docusaurus/plugin-content-pages": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/plugin-content-pages/-/plugin-content-pages-2.4.3.tgz", + "integrity": "sha512-txtDVz7y3zGk67q0HjG0gRttVPodkHqE0bpJ+7dOaTH40CQFLSh7+aBeGnPOTl+oCPG+hxkim4SndqPqXjQ8Bg==", + "dependencies": { + "@docusaurus/core": "2.4.3", + "@docusaurus/mdx-loader": "2.4.3", + "@docusaurus/types": "2.4.3", + "@docusaurus/utils": "2.4.3", + "@docusaurus/utils-validation": "2.4.3", "fs-extra": "^10.1.0", "tslib": "^2.4.0", "webpack": "^5.73.0" @@ -2351,14 +2427,33 @@ "react-dom": "^16.8.4 || ^17.0.0" } }, + "node_modules/@docusaurus/plugin-content-pages/node_modules/@docusaurus/types": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/types/-/types-2.4.3.tgz", + "integrity": "sha512-W6zNLGQqfrp/EoPD0bhb9n7OobP+RHpmvVzpA+Z/IuU3Q63njJM24hmT0GYboovWcDtFmnIJC9wcyx4RVPQscw==", + "dependencies": { + "@types/history": "^4.7.11", + "@types/react": "*", + "commander": "^5.1.0", + "joi": "^17.6.0", + "react-helmet-async": "^1.3.0", + "utility-types": "^3.10.0", + "webpack": "^5.73.0", + "webpack-merge": "^5.8.0" + }, + "peerDependencies": { + "react": "^16.8.4 || ^17.0.0", + "react-dom": "^16.8.4 || ^17.0.0" + } + }, "node_modules/@docusaurus/plugin-debug": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/plugin-debug/-/plugin-debug-2.4.1.tgz", - "integrity": "sha512-7Yu9UPzRShlrH/G8btOpR0e6INFZr0EegWplMjOqelIwAcx3PKyR8mgPTxGTxcqiYj6hxSCRN0D8R7YrzImwNA==", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/plugin-debug/-/plugin-debug-2.4.3.tgz", + "integrity": 
"sha512-LkUbuq3zCmINlFb+gAd4ZvYr+bPAzMC0hwND4F7V9bZ852dCX8YoWyovVUBKq4er1XsOwSQaHmNGtObtn8Av8Q==", "dependencies": { - "@docusaurus/core": "2.4.1", - "@docusaurus/types": "2.4.1", - "@docusaurus/utils": "2.4.1", + "@docusaurus/core": "2.4.3", + "@docusaurus/types": "2.4.3", + "@docusaurus/utils": "2.4.3", "fs-extra": "^10.1.0", "react-json-view": "^1.21.3", "tslib": "^2.4.0" @@ -2371,14 +2466,33 @@ "react-dom": "^16.8.4 || ^17.0.0" } }, + "node_modules/@docusaurus/plugin-debug/node_modules/@docusaurus/types": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/types/-/types-2.4.3.tgz", + "integrity": "sha512-W6zNLGQqfrp/EoPD0bhb9n7OobP+RHpmvVzpA+Z/IuU3Q63njJM24hmT0GYboovWcDtFmnIJC9wcyx4RVPQscw==", + "dependencies": { + "@types/history": "^4.7.11", + "@types/react": "*", + "commander": "^5.1.0", + "joi": "^17.6.0", + "react-helmet-async": "^1.3.0", + "utility-types": "^3.10.0", + "webpack": "^5.73.0", + "webpack-merge": "^5.8.0" + }, + "peerDependencies": { + "react": "^16.8.4 || ^17.0.0", + "react-dom": "^16.8.4 || ^17.0.0" + } + }, "node_modules/@docusaurus/plugin-google-analytics": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/plugin-google-analytics/-/plugin-google-analytics-2.4.1.tgz", - "integrity": "sha512-dyZJdJiCoL+rcfnm0RPkLt/o732HvLiEwmtoNzOoz9MSZz117UH2J6U2vUDtzUzwtFLIf32KkeyzisbwUCgcaQ==", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/plugin-google-analytics/-/plugin-google-analytics-2.4.3.tgz", + "integrity": "sha512-KzBV3k8lDkWOhg/oYGxlK5o9bOwX7KpPc/FTWoB+SfKhlHfhq7qcQdMi1elAaVEIop8tgK6gD1E58Q+XC6otSQ==", "dependencies": { - "@docusaurus/core": "2.4.1", - "@docusaurus/types": "2.4.1", - "@docusaurus/utils-validation": "2.4.1", + "@docusaurus/core": "2.4.3", + "@docusaurus/types": "2.4.3", + "@docusaurus/utils-validation": "2.4.3", "tslib": "^2.4.0" }, "engines": { @@ -2389,14 +2503,33 @@ "react-dom": "^16.8.4 || ^17.0.0" } }, + "node_modules/@docusaurus/plugin-google-analytics/node_modules/@docusaurus/types": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/types/-/types-2.4.3.tgz", + "integrity": "sha512-W6zNLGQqfrp/EoPD0bhb9n7OobP+RHpmvVzpA+Z/IuU3Q63njJM24hmT0GYboovWcDtFmnIJC9wcyx4RVPQscw==", + "dependencies": { + "@types/history": "^4.7.11", + "@types/react": "*", + "commander": "^5.1.0", + "joi": "^17.6.0", + "react-helmet-async": "^1.3.0", + "utility-types": "^3.10.0", + "webpack": "^5.73.0", + "webpack-merge": "^5.8.0" + }, + "peerDependencies": { + "react": "^16.8.4 || ^17.0.0", + "react-dom": "^16.8.4 || ^17.0.0" + } + }, "node_modules/@docusaurus/plugin-google-gtag": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/plugin-google-gtag/-/plugin-google-gtag-2.4.1.tgz", - "integrity": "sha512-mKIefK+2kGTQBYvloNEKtDmnRD7bxHLsBcxgnbt4oZwzi2nxCGjPX6+9SQO2KCN5HZbNrYmGo5GJfMgoRvy6uA==", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/plugin-google-gtag/-/plugin-google-gtag-2.4.3.tgz", + "integrity": "sha512-5FMg0rT7sDy4i9AGsvJC71MQrqQZwgLNdDetLEGDHLfSHLvJhQbTCUGbGXknUgWXQJckcV/AILYeJy+HhxeIFA==", "dependencies": { - "@docusaurus/core": "2.4.1", - "@docusaurus/types": "2.4.1", - "@docusaurus/utils-validation": "2.4.1", + "@docusaurus/core": "2.4.3", + "@docusaurus/types": "2.4.3", + "@docusaurus/utils-validation": "2.4.3", "tslib": "^2.4.0" }, "engines": { @@ -2407,14 +2540,33 @@ "react-dom": "^16.8.4 || ^17.0.0" } }, + "node_modules/@docusaurus/plugin-google-gtag/node_modules/@docusaurus/types": { + 
"version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/types/-/types-2.4.3.tgz", + "integrity": "sha512-W6zNLGQqfrp/EoPD0bhb9n7OobP+RHpmvVzpA+Z/IuU3Q63njJM24hmT0GYboovWcDtFmnIJC9wcyx4RVPQscw==", + "dependencies": { + "@types/history": "^4.7.11", + "@types/react": "*", + "commander": "^5.1.0", + "joi": "^17.6.0", + "react-helmet-async": "^1.3.0", + "utility-types": "^3.10.0", + "webpack": "^5.73.0", + "webpack-merge": "^5.8.0" + }, + "peerDependencies": { + "react": "^16.8.4 || ^17.0.0", + "react-dom": "^16.8.4 || ^17.0.0" + } + }, "node_modules/@docusaurus/plugin-google-tag-manager": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/plugin-google-tag-manager/-/plugin-google-tag-manager-2.4.1.tgz", - "integrity": "sha512-Zg4Ii9CMOLfpeV2nG74lVTWNtisFaH9QNtEw48R5QE1KIwDBdTVaiSA18G1EujZjrzJJzXN79VhINSbOJO/r3g==", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/plugin-google-tag-manager/-/plugin-google-tag-manager-2.4.3.tgz", + "integrity": "sha512-1jTzp71yDGuQiX9Bi0pVp3alArV0LSnHXempvQTxwCGAEzUWWaBg4d8pocAlTpbP9aULQQqhgzrs8hgTRPOM0A==", "dependencies": { - "@docusaurus/core": "2.4.1", - "@docusaurus/types": "2.4.1", - "@docusaurus/utils-validation": "2.4.1", + "@docusaurus/core": "2.4.3", + "@docusaurus/types": "2.4.3", + "@docusaurus/utils-validation": "2.4.3", "tslib": "^2.4.0" }, "engines": { @@ -2425,17 +2577,36 @@ "react-dom": "^16.8.4 || ^17.0.0" } }, - "node_modules/@docusaurus/plugin-sitemap": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/plugin-sitemap/-/plugin-sitemap-2.4.1.tgz", - "integrity": "sha512-lZx+ijt/+atQ3FVE8FOHV/+X3kuok688OydDXrqKRJyXBJZKgGjA2Qa8RjQ4f27V2woaXhtnyrdPop/+OjVMRg==", + "node_modules/@docusaurus/plugin-google-tag-manager/node_modules/@docusaurus/types": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/types/-/types-2.4.3.tgz", + "integrity": "sha512-W6zNLGQqfrp/EoPD0bhb9n7OobP+RHpmvVzpA+Z/IuU3Q63njJM24hmT0GYboovWcDtFmnIJC9wcyx4RVPQscw==", "dependencies": { - "@docusaurus/core": "2.4.1", - "@docusaurus/logger": "2.4.1", - "@docusaurus/types": "2.4.1", - "@docusaurus/utils": "2.4.1", - "@docusaurus/utils-common": "2.4.1", - "@docusaurus/utils-validation": "2.4.1", + "@types/history": "^4.7.11", + "@types/react": "*", + "commander": "^5.1.0", + "joi": "^17.6.0", + "react-helmet-async": "^1.3.0", + "utility-types": "^3.10.0", + "webpack": "^5.73.0", + "webpack-merge": "^5.8.0" + }, + "peerDependencies": { + "react": "^16.8.4 || ^17.0.0", + "react-dom": "^16.8.4 || ^17.0.0" + } + }, + "node_modules/@docusaurus/plugin-sitemap": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/plugin-sitemap/-/plugin-sitemap-2.4.3.tgz", + "integrity": "sha512-LRQYrK1oH1rNfr4YvWBmRzTL0LN9UAPxBbghgeFRBm5yloF6P+zv1tm2pe2hQTX/QP5bSKdnajCvfnScgKXMZQ==", + "dependencies": { + "@docusaurus/core": "2.4.3", + "@docusaurus/logger": "2.4.3", + "@docusaurus/types": "2.4.3", + "@docusaurus/utils": "2.4.3", + "@docusaurus/utils-common": "2.4.3", + "@docusaurus/utils-validation": "2.4.3", "fs-extra": "^10.1.0", "sitemap": "^7.1.1", "tslib": "^2.4.0" @@ -2448,24 +2619,43 @@ "react-dom": "^16.8.4 || ^17.0.0" } }, + "node_modules/@docusaurus/plugin-sitemap/node_modules/@docusaurus/types": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/types/-/types-2.4.3.tgz", + "integrity": "sha512-W6zNLGQqfrp/EoPD0bhb9n7OobP+RHpmvVzpA+Z/IuU3Q63njJM24hmT0GYboovWcDtFmnIJC9wcyx4RVPQscw==", + "dependencies": { + 
"@types/history": "^4.7.11", + "@types/react": "*", + "commander": "^5.1.0", + "joi": "^17.6.0", + "react-helmet-async": "^1.3.0", + "utility-types": "^3.10.0", + "webpack": "^5.73.0", + "webpack-merge": "^5.8.0" + }, + "peerDependencies": { + "react": "^16.8.4 || ^17.0.0", + "react-dom": "^16.8.4 || ^17.0.0" + } + }, "node_modules/@docusaurus/preset-classic": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/preset-classic/-/preset-classic-2.4.1.tgz", - "integrity": "sha512-P4//+I4zDqQJ+UDgoFrjIFaQ1MeS9UD1cvxVQaI6O7iBmiHQm0MGROP1TbE7HlxlDPXFJjZUK3x3cAoK63smGQ==", - "dependencies": { - "@docusaurus/core": "2.4.1", - "@docusaurus/plugin-content-blog": "2.4.1", - "@docusaurus/plugin-content-docs": "2.4.1", - "@docusaurus/plugin-content-pages": "2.4.1", - "@docusaurus/plugin-debug": "2.4.1", - "@docusaurus/plugin-google-analytics": "2.4.1", - "@docusaurus/plugin-google-gtag": "2.4.1", - "@docusaurus/plugin-google-tag-manager": "2.4.1", - "@docusaurus/plugin-sitemap": "2.4.1", - "@docusaurus/theme-classic": "2.4.1", - "@docusaurus/theme-common": "2.4.1", - "@docusaurus/theme-search-algolia": "2.4.1", - "@docusaurus/types": "2.4.1" + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/preset-classic/-/preset-classic-2.4.3.tgz", + "integrity": "sha512-tRyMliepY11Ym6hB1rAFSNGwQDpmszvWYJvlK1E+md4SW8i6ylNHtpZjaYFff9Mdk3i/Pg8ItQq9P0daOJAvQw==", + "dependencies": { + "@docusaurus/core": "2.4.3", + "@docusaurus/plugin-content-blog": "2.4.3", + "@docusaurus/plugin-content-docs": "2.4.3", + "@docusaurus/plugin-content-pages": "2.4.3", + "@docusaurus/plugin-debug": "2.4.3", + "@docusaurus/plugin-google-analytics": "2.4.3", + "@docusaurus/plugin-google-gtag": "2.4.3", + "@docusaurus/plugin-google-tag-manager": "2.4.3", + "@docusaurus/plugin-sitemap": "2.4.3", + "@docusaurus/theme-classic": "2.4.3", + "@docusaurus/theme-common": "2.4.3", + "@docusaurus/theme-search-algolia": "2.4.3", + "@docusaurus/types": "2.4.3" }, "engines": { "node": ">=16.14" @@ -2475,6 +2665,25 @@ "react-dom": "^16.8.4 || ^17.0.0" } }, + "node_modules/@docusaurus/preset-classic/node_modules/@docusaurus/types": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/types/-/types-2.4.3.tgz", + "integrity": "sha512-W6zNLGQqfrp/EoPD0bhb9n7OobP+RHpmvVzpA+Z/IuU3Q63njJM24hmT0GYboovWcDtFmnIJC9wcyx4RVPQscw==", + "dependencies": { + "@types/history": "^4.7.11", + "@types/react": "*", + "commander": "^5.1.0", + "joi": "^17.6.0", + "react-helmet-async": "^1.3.0", + "utility-types": "^3.10.0", + "webpack": "^5.73.0", + "webpack-merge": "^5.8.0" + }, + "peerDependencies": { + "react": "^16.8.4 || ^17.0.0", + "react-dom": "^16.8.4 || ^17.0.0" + } + }, "node_modules/@docusaurus/react-loadable": { "version": "5.5.2", "resolved": "https://registry.npmjs.org/@docusaurus/react-loadable/-/react-loadable-5.5.2.tgz", @@ -2488,22 +2697,22 @@ } }, "node_modules/@docusaurus/theme-classic": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/theme-classic/-/theme-classic-2.4.1.tgz", - "integrity": "sha512-Rz0wKUa+LTW1PLXmwnf8mn85EBzaGSt6qamqtmnh9Hflkc+EqiYMhtUJeLdV+wsgYq4aG0ANc+bpUDpsUhdnwg==", - "dependencies": { - "@docusaurus/core": "2.4.1", - "@docusaurus/mdx-loader": "2.4.1", - "@docusaurus/module-type-aliases": "2.4.1", - "@docusaurus/plugin-content-blog": "2.4.1", - "@docusaurus/plugin-content-docs": "2.4.1", - "@docusaurus/plugin-content-pages": "2.4.1", - "@docusaurus/theme-common": "2.4.1", - "@docusaurus/theme-translations": "2.4.1", - 
"@docusaurus/types": "2.4.1", - "@docusaurus/utils": "2.4.1", - "@docusaurus/utils-common": "2.4.1", - "@docusaurus/utils-validation": "2.4.1", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/theme-classic/-/theme-classic-2.4.3.tgz", + "integrity": "sha512-QKRAJPSGPfDY2yCiPMIVyr+MqwZCIV2lxNzqbyUW0YkrlmdzzP3WuQJPMGLCjWgQp/5c9kpWMvMxjhpZx1R32Q==", + "dependencies": { + "@docusaurus/core": "2.4.3", + "@docusaurus/mdx-loader": "2.4.3", + "@docusaurus/module-type-aliases": "2.4.3", + "@docusaurus/plugin-content-blog": "2.4.3", + "@docusaurus/plugin-content-docs": "2.4.3", + "@docusaurus/plugin-content-pages": "2.4.3", + "@docusaurus/theme-common": "2.4.3", + "@docusaurus/theme-translations": "2.4.3", + "@docusaurus/types": "2.4.3", + "@docusaurus/utils": "2.4.3", + "@docusaurus/utils-common": "2.4.3", + "@docusaurus/utils-validation": "2.4.3", "@mdx-js/react": "^1.6.22", "clsx": "^1.2.1", "copy-text-to-clipboard": "^3.0.1", @@ -2526,18 +2735,56 @@ "react-dom": "^16.8.4 || ^17.0.0" } }, - "node_modules/@docusaurus/theme-common": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/theme-common/-/theme-common-2.4.1.tgz", - "integrity": "sha512-G7Zau1W5rQTaFFB3x3soQoZpkgMbl/SYNG8PfMFIjKa3M3q8n0m/GRf5/H/e5BqOvt8c+ZWIXGCiz+kUCSHovA==", + "node_modules/@docusaurus/theme-classic/node_modules/@docusaurus/module-type-aliases": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/module-type-aliases/-/module-type-aliases-2.4.3.tgz", + "integrity": "sha512-cwkBkt1UCiduuvEAo7XZY01dJfRn7UR/75mBgOdb1hKknhrabJZ8YH+7savd/y9kLExPyrhe0QwdS9GuzsRRIA==", + "dependencies": { + "@docusaurus/react-loadable": "5.5.2", + "@docusaurus/types": "2.4.3", + "@types/history": "^4.7.11", + "@types/react": "*", + "@types/react-router-config": "*", + "@types/react-router-dom": "*", + "react-helmet-async": "*", + "react-loadable": "npm:@docusaurus/react-loadable@5.5.2" + }, + "peerDependencies": { + "react": "*", + "react-dom": "*" + } + }, + "node_modules/@docusaurus/theme-classic/node_modules/@docusaurus/types": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/types/-/types-2.4.3.tgz", + "integrity": "sha512-W6zNLGQqfrp/EoPD0bhb9n7OobP+RHpmvVzpA+Z/IuU3Q63njJM24hmT0GYboovWcDtFmnIJC9wcyx4RVPQscw==", "dependencies": { - "@docusaurus/mdx-loader": "2.4.1", - "@docusaurus/module-type-aliases": "2.4.1", - "@docusaurus/plugin-content-blog": "2.4.1", - "@docusaurus/plugin-content-docs": "2.4.1", - "@docusaurus/plugin-content-pages": "2.4.1", - "@docusaurus/utils": "2.4.1", - "@docusaurus/utils-common": "2.4.1", + "@types/history": "^4.7.11", + "@types/react": "*", + "commander": "^5.1.0", + "joi": "^17.6.0", + "react-helmet-async": "^1.3.0", + "utility-types": "^3.10.0", + "webpack": "^5.73.0", + "webpack-merge": "^5.8.0" + }, + "peerDependencies": { + "react": "^16.8.4 || ^17.0.0", + "react-dom": "^16.8.4 || ^17.0.0" + } + }, + "node_modules/@docusaurus/theme-common": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/theme-common/-/theme-common-2.4.3.tgz", + "integrity": "sha512-7KaDJBXKBVGXw5WOVt84FtN8czGWhM0lbyWEZXGp8AFfL6sZQfRTluFp4QriR97qwzSyOfQb+nzcDZZU4tezUw==", + "dependencies": { + "@docusaurus/mdx-loader": "2.4.3", + "@docusaurus/module-type-aliases": "2.4.3", + "@docusaurus/plugin-content-blog": "2.4.3", + "@docusaurus/plugin-content-docs": "2.4.3", + "@docusaurus/plugin-content-pages": "2.4.3", + "@docusaurus/utils": "2.4.3", + "@docusaurus/utils-common": "2.4.3", "@types/history": 
"^4.7.11", "@types/react": "*", "@types/react-router-config": "*", @@ -2556,19 +2803,57 @@ "react-dom": "^16.8.4 || ^17.0.0" } }, + "node_modules/@docusaurus/theme-common/node_modules/@docusaurus/module-type-aliases": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/module-type-aliases/-/module-type-aliases-2.4.3.tgz", + "integrity": "sha512-cwkBkt1UCiduuvEAo7XZY01dJfRn7UR/75mBgOdb1hKknhrabJZ8YH+7savd/y9kLExPyrhe0QwdS9GuzsRRIA==", + "dependencies": { + "@docusaurus/react-loadable": "5.5.2", + "@docusaurus/types": "2.4.3", + "@types/history": "^4.7.11", + "@types/react": "*", + "@types/react-router-config": "*", + "@types/react-router-dom": "*", + "react-helmet-async": "*", + "react-loadable": "npm:@docusaurus/react-loadable@5.5.2" + }, + "peerDependencies": { + "react": "*", + "react-dom": "*" + } + }, + "node_modules/@docusaurus/theme-common/node_modules/@docusaurus/types": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/types/-/types-2.4.3.tgz", + "integrity": "sha512-W6zNLGQqfrp/EoPD0bhb9n7OobP+RHpmvVzpA+Z/IuU3Q63njJM24hmT0GYboovWcDtFmnIJC9wcyx4RVPQscw==", + "dependencies": { + "@types/history": "^4.7.11", + "@types/react": "*", + "commander": "^5.1.0", + "joi": "^17.6.0", + "react-helmet-async": "^1.3.0", + "utility-types": "^3.10.0", + "webpack": "^5.73.0", + "webpack-merge": "^5.8.0" + }, + "peerDependencies": { + "react": "^16.8.4 || ^17.0.0", + "react-dom": "^16.8.4 || ^17.0.0" + } + }, "node_modules/@docusaurus/theme-search-algolia": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/theme-search-algolia/-/theme-search-algolia-2.4.1.tgz", - "integrity": "sha512-6BcqW2lnLhZCXuMAvPRezFs1DpmEKzXFKlYjruuas+Xy3AQeFzDJKTJFIm49N77WFCTyxff8d3E4Q9pi/+5McQ==", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/theme-search-algolia/-/theme-search-algolia-2.4.3.tgz", + "integrity": "sha512-jziq4f6YVUB5hZOB85ELATwnxBz/RmSLD3ksGQOLDPKVzat4pmI8tddNWtriPpxR04BNT+ZfpPUMFkNFetSW1Q==", "dependencies": { "@docsearch/react": "^3.1.1", - "@docusaurus/core": "2.4.1", - "@docusaurus/logger": "2.4.1", - "@docusaurus/plugin-content-docs": "2.4.1", - "@docusaurus/theme-common": "2.4.1", - "@docusaurus/theme-translations": "2.4.1", - "@docusaurus/utils": "2.4.1", - "@docusaurus/utils-validation": "2.4.1", + "@docusaurus/core": "2.4.3", + "@docusaurus/logger": "2.4.3", + "@docusaurus/plugin-content-docs": "2.4.3", + "@docusaurus/theme-common": "2.4.3", + "@docusaurus/theme-translations": "2.4.3", + "@docusaurus/utils": "2.4.3", + "@docusaurus/utils-validation": "2.4.3", "algoliasearch": "^4.13.1", "algoliasearch-helper": "^3.10.0", "clsx": "^1.2.1", @@ -2587,9 +2872,9 @@ } }, "node_modules/@docusaurus/theme-translations": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/theme-translations/-/theme-translations-2.4.1.tgz", - "integrity": "sha512-T1RAGP+f86CA1kfE8ejZ3T3pUU3XcyvrGMfC/zxCtc2BsnoexuNI9Vk2CmuKCb+Tacvhxjv5unhxXce0+NKyvA==", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/theme-translations/-/theme-translations-2.4.3.tgz", + "integrity": "sha512-H4D+lbZbjbKNS/Zw1Lel64PioUAIT3cLYYJLUf3KkuO/oc9e0QCVhIYVtUI2SfBCF2NNdlyhBDQEEMygsCedIg==", "dependencies": { "fs-extra": "^10.1.0", "tslib": "^2.4.0" @@ -2602,6 +2887,7 @@ "version": "2.4.1", "resolved": "https://registry.npmjs.org/@docusaurus/types/-/types-2.4.1.tgz", "integrity": "sha512-0R+cbhpMkhbRXX138UOc/2XZFF8hiZa6ooZAEEJFp5scytzCw4tC1gChMFXrpa3d2tYE6AX8IrOEpSonLmfQuQ==", + "devOptional": 
true, "dependencies": { "@types/history": "^4.7.11", "@types/react": "*", @@ -2618,11 +2904,11 @@ } }, "node_modules/@docusaurus/utils": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/utils/-/utils-2.4.1.tgz", - "integrity": "sha512-1lvEZdAQhKNht9aPXPoh69eeKnV0/62ROhQeFKKxmzd0zkcuE/Oc5Gpnt00y/f5bIsmOsYMY7Pqfm/5rteT5GA==", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/utils/-/utils-2.4.3.tgz", + "integrity": "sha512-fKcXsjrD86Smxv8Pt0TBFqYieZZCPh4cbf9oszUq/AMhZn3ujwpKaVYZACPX8mmjtYx0JOgNx52CREBfiGQB4A==", "dependencies": { - "@docusaurus/logger": "2.4.1", + "@docusaurus/logger": "2.4.3", "@svgr/webpack": "^6.2.1", "escape-string-regexp": "^4.0.0", "file-loader": "^6.2.0", @@ -2652,9 +2938,9 @@ } }, "node_modules/@docusaurus/utils-common": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/utils-common/-/utils-common-2.4.1.tgz", - "integrity": "sha512-bCVGdZU+z/qVcIiEQdyx0K13OC5mYwxhSuDUR95oFbKVuXYRrTVrwZIqQljuo1fyJvFTKHiL9L9skQOPokuFNQ==", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/utils-common/-/utils-common-2.4.3.tgz", + "integrity": "sha512-/jascp4GbLQCPVmcGkPzEQjNaAk3ADVfMtudk49Ggb+131B1WDD6HqlSmDf8MxGdy7Dja2gc+StHf01kiWoTDQ==", "dependencies": { "tslib": "^2.4.0" }, @@ -2671,12 +2957,12 @@ } }, "node_modules/@docusaurus/utils-validation": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@docusaurus/utils-validation/-/utils-validation-2.4.1.tgz", - "integrity": "sha512-unII3hlJlDwZ3w8U+pMO3Lx3RhI4YEbY3YNsQj4yzrkZzlpqZOLuAiZK2JyULnD+TKbceKU0WyWkQXtYbLNDFA==", + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/@docusaurus/utils-validation/-/utils-validation-2.4.3.tgz", + "integrity": "sha512-G2+Vt3WR5E/9drAobP+hhZQMaswRwDlp6qOMi7o7ZypB+VO7N//DZWhZEwhcRGepMDJGQEwtPv7UxtYwPL9PBw==", "dependencies": { - "@docusaurus/logger": "2.4.1", - "@docusaurus/utils": "2.4.1", + "@docusaurus/logger": "2.4.3", + "@docusaurus/utils": "2.4.3", "joi": "^17.6.0", "js-yaml": "^4.1.0", "tslib": "^2.4.0" @@ -3330,9 +3616,9 @@ } }, "node_modules/@types/hast": { - "version": "2.3.5", - "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.5.tgz", - "integrity": "sha512-SvQi0L/lNpThgPoleH53cdjB3y9zpLlVjRbqB3rH8hx1jiRSBGAhyjV3H+URFjNVRqt2EdYNrbZE5IsGlNfpRg==", + "version": "2.3.10", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.10.tgz", + "integrity": "sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==", "dependencies": { "@types/unist": "^2" } @@ -3387,9 +3673,9 @@ "integrity": "sha512-Hr5Jfhc9eYOQNPYO5WLDq/n4jqijdHNlDXjuAQkkt+mWdQR+XJToOHrsD4cPaMXpn6KO7y2+wM8AZEs8VpBLVA==" }, "node_modules/@types/mdast": { - "version": "3.0.12", - "resolved": "https://registry.npmjs.org/@types/mdast/-/mdast-3.0.12.tgz", - "integrity": "sha512-DT+iNIRNX884cx0/Q1ja7NyUPpZuv0KPyL5rGNxm1WC1OtHstl7n4Jb7nk+xacNShQMbczJjt8uFzznpp6kYBg==", + "version": "3.0.15", + "resolved": "https://registry.npmjs.org/@types/mdast/-/mdast-3.0.15.tgz", + "integrity": "sha512-LnwD+mUEfxWMa1QpDraczIn6k0Ee3SMicuYSSzS6ZYl2gKS09EClnJYGd8Du6rfc5r/GZEk5o1mRb8TaTj03sQ==", "dependencies": { "@types/unist": "^2" } @@ -3474,9 +3760,9 @@ "integrity": "sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==" }, "node_modules/@types/sax": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/@types/sax/-/sax-1.2.4.tgz", - "integrity": 
"sha512-pSAff4IAxJjfAXUG6tFkO7dsSbTmf8CtUpfhhZ5VhkRpC4628tJhh3+V6H1E+/Gs9piSzYKT5yzHO5M4GG9jkw==", + "version": "1.2.7", + "resolved": "https://registry.npmjs.org/@types/sax/-/sax-1.2.7.tgz", + "integrity": "sha512-rO73L89PJxeYM3s3pPPjiPgVVcymqU490g0YO5n5By0k2Erzj6tay/4lr1CHAAU4JyOWd1rpQ8bCf6cZfHU96A==", "dependencies": { "@types/node": "*" } @@ -3522,9 +3808,9 @@ } }, "node_modules/@types/unist": { - "version": "2.0.8", - "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.8.tgz", - "integrity": "sha512-d0XxK3YTObnWVp6rZuev3c49+j4Lo8g4L1ZRm9z5L0xpoZycUPshHgczK5gsUMaZOstjVYYi09p5gYvUtfChYw==" + "version": "2.0.10", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.10.tgz", + "integrity": "sha512-IfYcSBWE3hLpBg8+X2SEa8LVkJdJEkT2Ese2aaLs3ptGdVtABxndrMaxuFlQ1qdFf9Q5rDvDpxI3WwgvKFAsQA==" }, "node_modules/@types/ws": { "version": "8.5.5", @@ -3826,30 +4112,31 @@ } }, "node_modules/algoliasearch": { - "version": "4.19.1", - "resolved": "https://registry.npmjs.org/algoliasearch/-/algoliasearch-4.19.1.tgz", - "integrity": "sha512-IJF5b93b2MgAzcE/tuzW0yOPnuUyRgGAtaPv5UUywXM8kzqfdwZTO4sPJBzoGz1eOy6H9uEchsJsBFTELZSu+g==", - "dependencies": { - "@algolia/cache-browser-local-storage": "4.19.1", - "@algolia/cache-common": "4.19.1", - "@algolia/cache-in-memory": "4.19.1", - "@algolia/client-account": "4.19.1", - "@algolia/client-analytics": "4.19.1", - "@algolia/client-common": "4.19.1", - "@algolia/client-personalization": "4.19.1", - "@algolia/client-search": "4.19.1", - "@algolia/logger-common": "4.19.1", - "@algolia/logger-console": "4.19.1", - "@algolia/requester-browser-xhr": "4.19.1", - "@algolia/requester-common": "4.19.1", - "@algolia/requester-node-http": "4.19.1", - "@algolia/transporter": "4.19.1" + "version": "4.23.3", + "resolved": "https://registry.npmjs.org/algoliasearch/-/algoliasearch-4.23.3.tgz", + "integrity": "sha512-Le/3YgNvjW9zxIQMRhUHuhiUjAlKY/zsdZpfq4dlLqg6mEm0nL6yk+7f2hDOtLpxsgE4jSzDmvHL7nXdBp5feg==", + "dependencies": { + "@algolia/cache-browser-local-storage": "4.23.3", + "@algolia/cache-common": "4.23.3", + "@algolia/cache-in-memory": "4.23.3", + "@algolia/client-account": "4.23.3", + "@algolia/client-analytics": "4.23.3", + "@algolia/client-common": "4.23.3", + "@algolia/client-personalization": "4.23.3", + "@algolia/client-search": "4.23.3", + "@algolia/logger-common": "4.23.3", + "@algolia/logger-console": "4.23.3", + "@algolia/recommend": "4.23.3", + "@algolia/requester-browser-xhr": "4.23.3", + "@algolia/requester-common": "4.23.3", + "@algolia/requester-node-http": "4.23.3", + "@algolia/transporter": "4.23.3" } }, "node_modules/algoliasearch-helper": { - "version": "3.14.0", - "resolved": "https://registry.npmjs.org/algoliasearch-helper/-/algoliasearch-helper-3.14.0.tgz", - "integrity": "sha512-gXDXzsSS0YANn5dHr71CUXOo84cN4azhHKUbg71vAWnH+1JBiR4jf7to3t3JHXknXkbV0F7f055vUSBKrltHLQ==", + "version": "3.21.0", + "resolved": "https://registry.npmjs.org/algoliasearch-helper/-/algoliasearch-helper-3.21.0.tgz", + "integrity": "sha512-hjVOrL15I3Y3K8xG0icwG1/tWE+MocqBrhW6uVBWpU+/kVEMK0BnM2xdssj6mZM61eJ4iRxHR0djEI3ENOpR8w==", "dependencies": { "@algolia/events": "^4.0.1" }, @@ -3965,9 +4252,9 @@ } }, "node_modules/autoprefixer": { - "version": "10.4.15", - "resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.15.tgz", - "integrity": "sha512-KCuPB8ZCIqFdA4HwKXsvz7j6gvSDNhDP7WnUjBleRkKjPdvCmHFuQ77ocavI8FT6NdvlBnE2UFr2H4Mycn8Vew==", + "version": "10.4.19", + "resolved": 
"https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.19.tgz", + "integrity": "sha512-BaENR2+zBZ8xXhM4pUaKUxlVdxZ0EZhjvbopwnXmxRUfqDmwSpC2lAi/QXvx7NRdPCo1WKEcEF6mV64si1z4Ew==", "funding": [ { "type": "opencollective", @@ -3983,9 +4270,9 @@ } ], "dependencies": { - "browserslist": "^4.21.10", - "caniuse-lite": "^1.0.30001520", - "fraction.js": "^4.2.0", + "browserslist": "^4.23.0", + "caniuse-lite": "^1.0.30001599", + "fraction.js": "^4.3.7", "normalize-range": "^0.1.2", "picocolors": "^1.0.0", "postcss-value-parser": "^4.2.0" @@ -4258,9 +4545,9 @@ } }, "node_modules/browserslist": { - "version": "4.21.10", - "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.21.10.tgz", - "integrity": "sha512-bipEBdZfVH5/pwrvqc+Ub0kUPVfGUhlKxbvfD+z1BDnPEO/X98ruXGA1WP5ASpAFKan7Qr6j736IacbZQuAlKQ==", + "version": "4.23.0", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.23.0.tgz", + "integrity": "sha512-QW8HiM1shhT2GuzkvklfjcKDiWFXHOeFCIA/huJPwHsslwcydgk7X+z2zXpEijP98UCY7HbubZt5J2Zgvf0CaQ==", "funding": [ { "type": "opencollective", @@ -4276,10 +4563,10 @@ } ], "dependencies": { - "caniuse-lite": "^1.0.30001517", - "electron-to-chromium": "^1.4.477", - "node-releases": "^2.0.13", - "update-browserslist-db": "^1.0.11" + "caniuse-lite": "^1.0.30001587", + "electron-to-chromium": "^1.4.668", + "node-releases": "^2.0.14", + "update-browserslist-db": "^1.0.13" }, "bin": { "browserslist": "cli.js" @@ -4414,9 +4701,9 @@ } }, "node_modules/caniuse-lite": { - "version": "1.0.30001528", - "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001528.tgz", - "integrity": "sha512-0Db4yyjR9QMNlsxh+kKWzQtkyflkG/snYheSzkjmvdEtEXB1+jt7A2HmSEiO6XIJPIbo92lHNGNySvE5pZcs5Q==", + "version": "1.0.30001629", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001629.tgz", + "integrity": "sha512-c3dl911slnQhmxUIT4HhYzT7wnBK/XYpGnYLOj4nJBaRiw52Ibe7YxlDaAeRECvA786zCuExhxIUJ2K7nHMrBw==", "funding": [ { "type": "opencollective", @@ -5811,9 +6098,9 @@ "integrity": "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==" }, "node_modules/electron-to-chromium": { - "version": "1.4.510", - "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.510.tgz", - "integrity": "sha512-xPfLIPFcN/WLXBpQ/K4UgE98oUBO5Tia6BD4rkSR0wE7ep/PwBVlgvPJQrIBpmJGVAmUzwPKuDbVt9XV6+uC2g==" + "version": "1.4.791", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.791.tgz", + "integrity": "sha512-6FlqP0NSWvxFf1v+gHu+LCn5wjr1pmkj5nPr7BsxPnj41EDR4EWhK/KmQN0ytHUqgTR1lkpHRYxvHBLZFQtkKw==" }, "node_modules/emoji-regex": { "version": "9.2.2", @@ -5909,9 +6196,9 @@ "integrity": "sha512-vZK7T0N2CBmBOixhmjdqx2gWVbFZ4DXZ/NyRMZVlJXPa7CyFS+/a4QQsDGDQy9ZfEzxFuNEsMLeQJnKP2p5/JA==" }, "node_modules/escalade": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.1.tgz", - "integrity": "sha512-k0er2gUkLf8O0zKJiAhmkTnJlTvINGv7ygDNPbeIsX/TJjGJZHuh9B2UxbsaEkmlEo9MfhrSzmhIlhRlI2GXnw==", + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.2.tgz", + "integrity": "sha512-ErCHMCae19vR8vQGe50xIsVomy19rg6gFu3+r3jkEO46suLMWBksvVyoGgQV+jOfl84ZSOSlmv6Gxa89PmTGmA==", "engines": { "node": ">=6" } @@ -6510,9 +6797,9 @@ } }, "node_modules/fraction.js": { - "version": "4.3.6", - "resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-4.3.6.tgz", - "integrity": 
"sha512-n2aZ9tNfYDwaHhvFTkhFErqOMIb8uyzSQ+vGJBjZyanAKZVbGUQ1sngfk9FdkBw7G26O7AgNjLcecLffD1c7eg==", + "version": "4.3.7", + "resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-4.3.7.tgz", + "integrity": "sha512-ZsDfxO51wGAXREY55a7la9LScWpwv9RxIrYABrlvOFBlH/ShPnrtsXeuUIfXKKOVicNxQ+o8JTbJvjS4M89yew==", "engines": { "node": "*" }, @@ -7302,9 +7589,9 @@ } }, "node_modules/image-size": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/image-size/-/image-size-1.0.2.tgz", - "integrity": "sha512-xfOoWjceHntRb3qFCrh5ZFORYH8XCdYpASltMhZ/Q0KZiOwjdE/Yl2QCiWdwD+lygV5bMCvauzgu5PxBX/Yerg==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/image-size/-/image-size-1.1.1.tgz", + "integrity": "sha512-541xKlUw6jr/6gGuk92F+mYM5zaFAc5ahphvkqvNe2bQ6gVBkd6bfrmVJ2t4KDAfikAYZyIqTnktX3i6/aQDrQ==", "dependencies": { "queue": "6.0.2" }, @@ -7312,7 +7599,7 @@ "image-size": "bin/image-size.js" }, "engines": { - "node": ">=14.0.0" + "node": ">=16.x" } }, "node_modules/immer": { @@ -8433,9 +8720,9 @@ } }, "node_modules/node-releases": { - "version": "2.0.13", - "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.13.tgz", - "integrity": "sha512-uYr7J37ae/ORWdZeQ1xxMJe3NtdmqMC/JZK+geofDrkLUApKRHPd18/TxtBOJ4A0/+uUIliorNrfYV6s1b02eQ==" + "version": "2.0.14", + "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.14.tgz", + "integrity": "sha512-y10wOWt8yZpqXmOgRo77WaHEmhYQYGNA6y421PKsKYWEK8aW+cqAphborZDhqfyKrbZEN92CN1X2KbafY2s7Yw==" }, "node_modules/node-watch": { "version": "0.7.4", @@ -8849,9 +9136,9 @@ } }, "node_modules/picocolors": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.0.tgz", - "integrity": "sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==" + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.1.tgz", + "integrity": "sha512-anP1Z8qwhkbmu7MFP5iTt+wQKXgwzf7zTyGlcdzabySa9vd0Xt392U0rVmz9poOaBj0uHJKyyo9/upk0HrEQew==" }, "node_modules/picomatch": { "version": "2.3.1", @@ -10718,9 +11005,9 @@ "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" }, "node_modules/sax": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz", - "integrity": "sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw==" + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/sax/-/sax-1.4.1.tgz", + "integrity": "sha512-+aWOz7yVScEGoKNd4PA10LZ8sk0A/z5+nXQG5giUO5rprX9jgYsTdov9qCchZiPIZezbZH+jRut8nPodFAX4Jg==" }, "node_modules/scheduler": { "version": "0.20.2", @@ -10754,9 +11041,9 @@ "integrity": "sha512-qGVDoreyYiP1pkQnbnFAUIS5AjenNwwQBdl7zeos9etl+hYKWahjRTfzAZZYBv5xNHx7vNKCmaLDQZ6Fr2AEXg==" }, "node_modules/search-insights": { - "version": "2.8.2", - "resolved": "https://registry.npmjs.org/search-insights/-/search-insights-2.8.2.tgz", - "integrity": "sha512-PxA9M5Q2bpBelVvJ3oDZR8nuY00Z6qwOxL53wNpgzV28M/D6u9WUbImDckjLSILBF8F1hn/mgyuUaOPtjow4Qw==", + "version": "2.14.0", + "resolved": "https://registry.npmjs.org/search-insights/-/search-insights-2.14.0.tgz", + "integrity": "sha512-OLN6MsPMCghDOqlCtsIsYgtsC0pnwVTyT9Mu6A3ewOj1DxvzZF6COrn2g86E/c05xbktB0XN04m/t1Z+n+fTGw==", "peer": true }, "node_modules/section-matter": { @@ -11123,9 +11410,9 @@ "integrity": "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==" }, "node_modules/sitemap": { - "version": 
"7.1.1", - "resolved": "https://registry.npmjs.org/sitemap/-/sitemap-7.1.1.tgz", - "integrity": "sha512-mK3aFtjz4VdJN0igpIJrinf3EO8U8mxOPsTBzSsy06UtjZQJ3YY3o3Xa7zSc5nMqcMrRwlChHZ18Kxg0caiPBg==", + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/sitemap/-/sitemap-7.1.2.tgz", + "integrity": "sha512-ARCqzHJ0p4gWt+j7NlU5eDlIO9+Rkr/JhPFZKKQ1l5GCus7rJH4UdrlVAh0xC/gDS/Qir2UMxqYNHtsKr2rpCw==", "dependencies": { "@types/node": "^17.0.5", "@types/sax": "^1.2.1", @@ -11814,9 +12101,9 @@ } }, "node_modules/ua-parser-js": { - "version": "1.0.35", - "resolved": "https://registry.npmjs.org/ua-parser-js/-/ua-parser-js-1.0.35.tgz", - "integrity": "sha512-fKnGuqmTBnIE+/KXSzCn4db8RTigUzw1AN0DmdU6hJovUTbYJKyqj+8Mt1c4VfRDnOVJnENmfYkIPZ946UrSAA==", + "version": "1.0.38", + "resolved": "https://registry.npmjs.org/ua-parser-js/-/ua-parser-js-1.0.38.tgz", + "integrity": "sha512-Aq5ppTOfvrCMgAPneW1HfWj66Xi7XL+/mIy996R1/CLS/rcyJQm6QZdsKrUeivDFQ+Oc9Wyuwor8Ze8peEoUoQ==", "funding": [ { "type": "opencollective", @@ -11825,6 +12112,10 @@ { "type": "paypal", "url": "https://paypal.me/faisalman" + }, + { + "type": "github", + "url": "https://github.com/sponsors/faisalman" } ], "engines": { @@ -12024,9 +12315,9 @@ } }, "node_modules/update-browserslist-db": { - "version": "1.0.11", - "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.0.11.tgz", - "integrity": "sha512-dCwEFf0/oT85M1fHBg4F0jtLwJrutGoHSQXCh7u4o2t1drG+c0a9Flnqww6XUKSfQMPpJBRjU8d4RXB09qtvaA==", + "version": "1.0.16", + "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.0.16.tgz", + "integrity": "sha512-KVbTxlBYlckhF5wgfyZXTWnMn7MMZjMu9XG8bPlliUOP9ThaF4QnhP8qrjrH7DRzHfSk0oQv1wToW+iA5GajEQ==", "funding": [ { "type": "opencollective", @@ -12042,8 +12333,8 @@ } ], "dependencies": { - "escalade": "^3.1.1", - "picocolors": "^1.0.0" + "escalade": "^3.1.2", + "picocolors": "^1.0.1" }, "bin": { "update-browserslist-db": "cli.js" @@ -12294,9 +12585,9 @@ } }, "node_modules/use-sync-external-store": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/use-sync-external-store/-/use-sync-external-store-1.2.0.tgz", - "integrity": "sha512-eEgnFxGQ1Ife9bzYs6VLi8/4X6CObHMw9Qr9tPY43iKwsPw8xE8+EFsf/2cFZ5S3esXgpWgtSCtLNS41F+sKPA==", + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/use-sync-external-store/-/use-sync-external-store-1.2.2.tgz", + "integrity": "sha512-PElTlVMwpblvbNqQ82d2n6RjStvdSoNe9FG28kNfz3WiXilJm4DdNkEzRhCZuIDwY8U08WVihhGR5iRqAwfDiw==", "peerDependencies": { "react": "^16.8.0 || ^17.0.0 || ^18.0.0" } diff --git a/docs/website/package.json b/docs/website/package.json index 1b1a5b1801..becf1c8bc6 100644 --- a/docs/website/package.json +++ b/docs/website/package.json @@ -16,8 +16,8 @@ "generate-api-reference": "PYTHONPATH=. 
poetry run pydoc-markdown" }, "dependencies": { - "@docusaurus/core": "2.4.1", - "@docusaurus/preset-classic": "2.4.1", + "@docusaurus/core": "2.4.3", + "@docusaurus/preset-classic": "2.4.3", "@mdx-js/react": "^1.6.22", "clsx": "^1.2.1", "dedent": "^1.5.1", @@ -49,6 +49,6 @@ ] }, "engines": { - "node": ">=16.14" + "node": ">=20.10" } } From 285b9dffe7d9dbfbc1ee6854bdaf89832382423e Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Fri, 7 Jun 2024 20:09:48 +0200 Subject: [PATCH 13/61] Remove unused imports (#1448) --- dlt/sources/helpers/rest_client/__init__.py | 2 +- dlt/sources/helpers/rest_client/client.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/dlt/sources/helpers/rest_client/__init__.py b/dlt/sources/helpers/rest_client/__init__.py index b2fb0a2351..c9fe0a4dce 100644 --- a/dlt/sources/helpers/rest_client/__init__.py +++ b/dlt/sources/helpers/rest_client/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional, Dict, Iterator, Union, Any +from typing import Optional, Dict, Iterator, Any from dlt.common import jsonpath diff --git a/dlt/sources/helpers/rest_client/client.py b/dlt/sources/helpers/rest_client/client.py index dc7304f159..12db4310da 100644 --- a/dlt/sources/helpers/rest_client/client.py +++ b/dlt/sources/helpers/rest_client/client.py @@ -6,7 +6,6 @@ Any, TypeVar, Iterable, - Union, cast, ) import copy @@ -21,7 +20,6 @@ from .typing import HTTPMethodBasic, HTTPMethod, Hooks from .paginators import BasePaginator -from .auth import AuthConfigBase from .detector import PaginatorFactory, find_response_page_data from .exceptions import IgnoreResponseException, PaginatorNotFound From 3609757c58316c622788dc7dd3a473a86aa2d71f Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Mon, 10 Jun 2024 13:17:18 +0530 Subject: [PATCH 14/61] Docs: Updated filesystem docs with explanations for bucket URLs (#1435) * Updated source/filesystem docs with explanations for bucket URLs * Updated * Updated as per comments --- docs/tools/package-lock.json | 6 ++ .../verified-sources/filesystem.md | 98 +++++++++++++------ 2 files changed, 74 insertions(+), 30 deletions(-) create mode 100644 docs/tools/package-lock.json diff --git a/docs/tools/package-lock.json b/docs/tools/package-lock.json new file mode 100644 index 0000000000..84291b66a8 --- /dev/null +++ b/docs/tools/package-lock.json @@ -0,0 +1,6 @@ +{ + "name": "tools", + "lockfileVersion": 3, + "requires": true, + "packages": {} +} diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md index 5c322db108..7552a0acb2 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem.md @@ -40,8 +40,8 @@ To access these, you'll need secret credentials: To get AWS keys for S3 access: 1. Access IAM in AWS Console. -1. Select "Users", choose a user, and open "Security credentials". -1. Click "Create access key" for AWS ID and Secret Key. +2. Select "Users", choose a user, and open "Security credentials". +3. Click "Create access key" for AWS ID and Secret Key. For more info, see [AWS official documentation.](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html) @@ -51,12 +51,12 @@ For more info, see To get GCS/GDrive access: 1. Log in to [console.cloud.google.com](http://console.cloud.google.com/). -1. Create a [service account](https://cloud.google.com/iam/docs/service-accounts-create#creating). -1. 
Enable "Cloud Storage API" / "Google Drive API"; see +2. Create a [service account](https://cloud.google.com/iam/docs/service-accounts-create#creating). +3. Enable "Cloud Storage API" / "Google Drive API"; see [Google's guide](https://support.google.com/googleapi/answer/6158841?hl=en). -1. In IAM & Admin > Service Accounts, find your account, click the three-dot menu > "Manage Keys" > +4. In IAM & Admin > Service Accounts, find your account, click the three-dot menu > "Manage Keys" > "ADD KEY" > "CREATE" to get a JSON credential file. -1. Grant the service account appropriate permissions for cloud storage access. +5. Grant the service account appropriate permissions for cloud storage access. For more info, see how to [create service account](https://support.google.com/a/answer/7378726?hl=en). @@ -66,9 +66,9 @@ For more info, see how to To obtain Azure blob storage access: 1. Go to Azure Portal (portal.azure.com). -1. Select "Storage accounts" > your storage. -1. Click "Settings" > "Access keys". -1. View account name and two keys (primary/secondary). Keep keys confidential. +2. Select "Storage accounts" > your storage. +3. Click "Settings" > "Access keys". +4. View account name and two keys (primary/secondary). Keep keys confidential. For more info, see [Azure official documentation](https://learn.microsoft.com/en-us/azure/storage/common/storage-account-keys-manage?tabs=azure-portal). @@ -88,10 +88,10 @@ To get started with your data pipeline, follow these steps: with filesystem as the [source](../../general-usage/source) and [duckdb](../destinations/duckdb.md) as the [destination](../destinations). -1. If you'd like to use a different destination, simply replace `duckdb` with the name of your +2. If you'd like to use a different destination, simply replace `duckdb` with the name of your preferred [destination](../destinations). -1. After running this command, a new directory will be created with the necessary files and +3. After running this command, a new directory will be created with the necessary files and configuration settings to get started. For more information, read the @@ -119,32 +119,71 @@ For more information, read the azure_storage_account_key="Please set me up!" ``` -1. Finally, enter credentials for your chosen destination as per the [docs](../destinations/). +2. Finally, enter credentials for your chosen destination as per the [docs](../destinations/). -1. You can pass the bucket URL and glob pattern or use `config.toml`. For local filesystems, use - `file://` or skip the schema and provide the local path in a format native for your operating system. +3. You can pass the bucket URL and glob pattern or use `config.toml`. For local filesystems, use + `file://` as follows: + + ```toml + [sources.filesystem] # use [sources.readers.credentials] for the "readers" source + bucket_url='file://Users/admin/Documents/csv_files' + file_glob="*" + ``` + or skip the schema and provide the local path in a format native for your operating system as follows: ```toml [sources.filesystem] # use [sources.readers.credentials] for the "readers" source bucket_url='~\Documents\csv_files\' file_glob="*" ``` + In the example above we use Windows path to current user's Documents folder. Mind that literal toml string (single quotes) was used to conveniently use the backslashes without need to escape. For remote file systems you need to add the schema, it will be used to get the protocol being - used: + used. 
The protocols that can be used are: - ```toml - [sources.filesystem] # use [sources.readers.credentials] for the "readers" source - # bucket_url="az://my-bucket/csv_files/" - for Azure Blob Storage - # bucket_url="gdrive://my-bucket/csv_files/" - for Google Drive folder - # bucket_url="gs://my-bucket/csv_files/" - for Google Storage - bucket_url="s3://my-bucket/csv_files/" # for AWS S3 - ``` - :::caution - For Azure, use adlfs>=2023.9.0. Older versions mishandle globs. - ::: + - For Azure blob storage + ```toml + [sources.filesystem] # use [sources.readers.credentials] for the "readers" source + bucket_url="az:////" + ``` + + - `az://` indicates the Azure Blob Storage protocol. + - `container_name` is the name of the container. + - `path_to_files/` is a directory path within the container. + + `CAUTION: For Azure, use adlfs>=2023.9.0. Older versions mishandle globs.` + + - For Google Drive + ```toml + [sources.filesystem] # use [sources.readers.credentials] for the "readers" source + bucket_url="gdrive:////" + ``` + + - `gdrive://` indicates that the Google Drive protocol. + - `folder_name` refers to a folder within Google Drive. + - `subfolder_or_file_path/` is a sub-folder or directory path within the my-bucket folder. + + - For Google Storage + ```toml + [sources.filesystem] # use [sources.readers.credentials] for the "readers" source + bucket_url="gs:////" + ``` + + - `gs://` indicates the Google Cloud Storage protocol. + - `bucket_name` is the name of the bucket. + - `path_to_files/` is a directory path within the bucket. + + - For AWS S3 + ```toml + [sources.filesystem] # use [sources.readers.credentials] for the "readers" source + bucket_url="s3:////" + ``` + + - `s3://` indicates the AWS S3 protocol. + - `bucket_name` is the name of the bucket. + - `path_to_files/` is a directory path within the bucket. ### Use local file system paths You can use both native local file system paths and in form of `file:` uri. Absolute, relative and UNC Windows paths are supported. @@ -172,7 +211,7 @@ bucket_url = '\\?\C:\a\b\c' pip install -r requirements.txt ``` -1. Install optional modules: +2. Install optional modules: - For AWS S3: ```sh @@ -184,13 +223,13 @@ bucket_url = '\\?\C:\a\b\c' ``` - GCS storage: No separate module needed. -1. You're now ready to run the pipeline! To get started, run the following command: +3. You're now ready to run the pipeline! To get started, run the following command: ```sh python filesystem_pipeline.py ``` -1. Once the pipeline has finished running, you can verify that everything loaded correctly by using +4. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command: ```sh @@ -493,5 +532,4 @@ verified source. 
fs_client.ls("ci-test-bucket/standard_source/samples") ``` - - + \ No newline at end of file From 24066364cf52d428157c1e43cdc0fbb82baf7ccb Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Mon, 10 Jun 2024 16:15:38 +0200 Subject: [PATCH 15/61] Pass environment settings to requests.Session.send, fixes #1447 --- dlt/sources/helpers/rest_client/client.py | 6 +++++- tests/sources/helpers/rest_client/test_client.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/dlt/sources/helpers/rest_client/client.py b/dlt/sources/helpers/rest_client/client.py index 12db4310da..e6135b5c0f 100644 --- a/dlt/sources/helpers/rest_client/client.py +++ b/dlt/sources/helpers/rest_client/client.py @@ -124,7 +124,11 @@ def _send_request(self, request: Request) -> Response: prepared_request = self.session.prepare_request(request) - return self.session.send(prepared_request) + send_kwargs = self.session.merge_environment_settings( + prepared_request.url, {}, None, None, None + ) + + return self.session.send(prepared_request, **send_kwargs) def request(self, path: str = "", method: HTTPMethod = "GET", **kwargs: Any) -> Response: prepared_request = self._create_request( diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 79a57d0e82..bd65affe62 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -241,3 +241,17 @@ def __call__(self, request: PreparedRequest) -> PreparedRequest: assert_pagination(pages_list) assert pages_list[0].response.request.headers["Authorization"] == "Bearer test-token" + + def test_send_request_allows_ca_bundle(self, mocker, rest_client): + mocker.patch.dict(os.environ, {"REQUESTS_CA_BUNDLE": "/path/to/some/ca-bundle"}) + + _send = rest_client.session.send + + def _fake_send(*args, **kwargs): + assert kwargs["verify"] == "/path/to/some/ca-bundle" + return _send(*args, **kwargs) + + rest_client.session.send = _fake_send + + result = rest_client.get("/posts/1") + assert result.status_code == 200 From b108e7f10d1e99bf06d4275a1f43686ca9d9fe95 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Tue, 11 Jun 2024 13:48:54 +0530 Subject: [PATCH 16/61] Added a note for indexing (#1451) --- docs/website/docs/general-usage/incremental-loading.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index c2c951c9b0..72957402da 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -210,6 +210,10 @@ def resource(): ... ``` +:::note +Indexing is important for doing lookups by column value, especially for merge writes, to ensure acceptable performance in some destinations. 
+::: + #### Forcing root key propagation Merge write disposition requires that the `_dlt_id` of top level table is propagated to child From 3f598b916facff98b8a829d93b86a4611fbad0b9 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Tue, 11 Jun 2024 13:54:22 +0200 Subject: [PATCH 17/61] docs for loading with contracts to existing tables (#1441) * add test and some notes on existing table * small update * add section to destination tables --- .../docs/general-usage/destination-tables.md | 36 ++++++++++++++++++- .../docs/general-usage/schema-contracts.md | 22 +++++++----- tests/pipeline/test_schema_contracts.py | 31 +++++++++++++++- 3 files changed, 79 insertions(+), 10 deletions(-) diff --git a/docs/website/docs/general-usage/destination-tables.md b/docs/website/docs/general-usage/destination-tables.md index b53d864a96..7df19eff30 100644 --- a/docs/website/docs/general-usage/destination-tables.md +++ b/docs/website/docs/general-usage/destination-tables.md @@ -303,4 +303,38 @@ load_info = pipeline.run(data, table_name="users") Every time you run this pipeline, a new schema will be created in the destination database with a datetime-based suffix. The data will be loaded into tables in this schema. For example, the first time you run the pipeline, the schema will be named -`mydata_20230912064403`, the second time it will be named `mydata_20230912064407`, and so on. \ No newline at end of file +`mydata_20230912064403`, the second time it will be named `mydata_20230912064407`, and so on. + +## Loading data into existing tables not created by dlt + +You can also load data from `dlt` into tables that already exist in the destination dataset and were not created by `dlt`. +There are a few things to keep in mind when you are doing this: + +If you load data to a table that exists but does not contain any data, in most cases your load will succeed without problems. +`dlt` will create the needed columns and insert the incoming data. `dlt` will only be aware of columns that exist on the +discovered or provided internal schema, so if you have columns in your destination, that are not anticipated by `dlt`, they +will remain in the destination but stay unknown to `dlt`. This will generally not be a problem. + +If your destination table already exists and contains columns that have the same name as columns discovered by `dlt` but +do not have matching datatypes, your load will fail and you will have to fix the column on the destination table first, +or change the column name in your incoming data to something else to avoid a collission. + +If your destination table exists and already contains data, your load might also initially fail, since `dlt` creates +special `non-nullable` columns that contains required mandatory metadata. Some databases will not allow you to create +`non-nullable` columns on tables that have data, since the initial value for these columns of the existing rows can +not be inferred. You will have to manually create these columns with the correct type on your existing tables and +make them `nullable`, then fill in values for the existing rows. Some databases may allow you to create a new column +that is `non-nullable` and take a default value for existing rows in the same command. 
The columns you will need to +create are: + +| name | type | +| --- | --- | +| _dlt_load_id | text/string/varchar | +| _dlt_id | text/string/varchar | + +For child-tables you may also need to create: + +| name | type | +| --- | --- | +| _dlt_parent_id | text/string/varchar | +| _dlt_root_id | text/string/varchar | \ No newline at end of file diff --git a/docs/website/docs/general-usage/schema-contracts.md b/docs/website/docs/general-usage/schema-contracts.md index 48ed52147d..41185852cc 100644 --- a/docs/website/docs/general-usage/schema-contracts.md +++ b/docs/website/docs/general-usage/schema-contracts.md @@ -151,19 +151,25 @@ Following tables are considered new: For example such table is considered new because column **number** is incomplete (define primary key and NOT null but no data type) ```yaml - blocks: - description: Ethereum blocks - write_disposition: append - columns: - number: - nullable: false - primary_key: true - name: number +blocks: + description: Ethereum blocks + write_disposition: append + columns: + number: + nullable: false + primary_key: true + name: number ``` What tables are not considered new: 1. Those with columns defined by Pydantic modes +### Working with datasets that have manually added tables and columns on the first load + +In some cases you might be working with datasets that have tables or columns created outside of dlt. If you are loading to a table not created by `dlt` for the first time, `dlt` will not know about this table while enforcing schema contracts. This means that if you do a load where the `tables` are set to `evolve`, all will work as planned. If you have `tables` set to `freeze`, dlt will raise an exception because it thinks you are creating a new table (which you are from dlts perspective). You can allow `evolve` for one load and then switch back to `freeze`. + +The same thing will happen if `dlt` knows your table, but you have manually added a column to your destination and you have `columns` set to `freeze`. + ### Code Examples The below code will silently ignore new subtables, allow new columns to be added to existing tables and raise an error if a variant of a column is discovered. diff --git a/tests/pipeline/test_schema_contracts.py b/tests/pipeline/test_schema_contracts.py index 4958299368..a46529b861 100644 --- a/tests/pipeline/test_schema_contracts.py +++ b/tests/pipeline/test_schema_contracts.py @@ -176,7 +176,7 @@ def get_pipeline(): import duckdb return dlt.pipeline( - pipeline_name=uniq_id(), + pipeline_name="contracts_" + uniq_id(), destination="duckdb", credentials=duckdb.connect(":memory:"), dev_mode=True, @@ -816,3 +816,32 @@ def get_items_extra_variant(as_list: bool = False): pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] ) assert table_counts[ITEMS_TABLE] == 1 if (contract_setting in ["freeze", "discard_row"]) else 3 + + +def test_write_to_existing_database_tables_frozen() -> None: + pipeline = get_pipeline() + + # Create a database schema with table + with pipeline.sql_client() as c: + table_name = c.make_qualified_table_name("test_items") + c.create_dataset() + c.execute_sql( + f"CREATE TABLE {table_name} (id INTEGER PRIMARY KEY, name VARCHAR NOT NULL," + " _dlt_load_id VARCHAR NOT NULL, _dlt_id VARCHAR NOT NULL)" + ) + + data = [ + {"id": 101, "name": "sub item 101"}, + {"id": 101, "name": "sub item 102"}, + ] + + # we are trying to load to a table existing on the destination but not known to our internal schema + # this will fail! 
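+    # as described in the schema-contracts docs added in this change, running once with
+    # schema_contract={"tables": "evolve"} would let dlt adopt this pre-existing table,
+    # after which the contract could be switched back to "freeze"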
+ with raises_step_exception( + expected_nested_error=DataValidationError, + ): + pipeline.run( + data, + table_name="test_items", + schema_contract={"tables": "freeze", "columns": "freeze", "data_type": "freeze"}, + ) From a9021fe8dec1b8b282cae112a3d3663c3914c05f Mon Sep 17 00:00:00 2001 From: David Scharf Date: Tue, 11 Jun 2024 15:35:06 +0200 Subject: [PATCH 18/61] Fix streamlit bug on chess example (#1425) * fix error on missing nullable hint * remove unneeded function (and unrelated formatting :) ) --- .../streamlit_app/blocks/table_hints.py | 27 +++++++-------- .../postgres_to_postgres.py | 33 ++++++++++--------- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/dlt/helpers/streamlit_app/blocks/table_hints.py b/dlt/helpers/streamlit_app/blocks/table_hints.py index e2ebcde1c9..4b0328d1dc 100644 --- a/dlt/helpers/streamlit_app/blocks/table_hints.py +++ b/dlt/helpers/streamlit_app/blocks/table_hints.py @@ -3,7 +3,7 @@ import dlt import streamlit as st -from dlt.common.schema.typing import TTableSchema +from dlt.common.schema.typing import TTableSchema, TColumnSchema from dlt.common.utils import flatten_list_or_items from dlt.helpers.streamlit_app.blocks.resource_state import resource_state_info from dlt.helpers.streamlit_app.blocks.show_data import show_data_button @@ -62,19 +62,14 @@ def list_table_hints(pipeline: dlt.Pipeline, tables: List[TTableSchema]) -> None table["resource"], ) - # table schema contains various hints (like clustering or partition options) - # that we do not want to show in basic view - def essentials_f(c: Any) -> Dict[str, Any]: - essentials: Dict[str, Any] = {} - for k, v in c.items(): - if k in ["name", "data_type", "nullable"]: - essentials[k] = v - - return { - "name": essentials["name"], - "data_type": essentials["data_type"], - "nullable": essentials["nullable"], - } - - st.table(map(essentials_f, table["columns"].values())) + st.table( + map( + lambda c: { + "name": c["name"], + "data_type": c.get("data_type"), + "nullable": c.get("nullable", True), + }, + table["columns"].values(), + ) + ) show_data_button(pipeline, table["name"]) diff --git a/docs/examples/postgres_to_postgres/postgres_to_postgres.py b/docs/examples/postgres_to_postgres/postgres_to_postgres.py index 85b8aed045..f5327ee236 100644 --- a/docs/examples/postgres_to_postgres/postgres_to_postgres.py +++ b/docs/examples/postgres_to_postgres/postgres_to_postgres.py @@ -91,16 +91,17 @@ def pg_resource_chunked( order_date: str, load_type: str = "merge", columns: str = "*", - credentials: ConnectionStringCredentials = dlt.secrets[ - "sources.postgres.credentials" - ], + credentials: ConnectionStringCredentials = dlt.secrets["sources.postgres.credentials"], ): print( f"dlt.resource write_disposition: `{load_type}` -- ", - f"connection string: postgresql://{credentials.username}:*****@{credentials.host}:{credentials.host}/{credentials.database}", + "connection string:" + f" postgresql://{credentials.username}:*****@{credentials.host}:{credentials.host}/{credentials.database}", ) - query = f"SELECT {columns} FROM {schema_name}.{table_name} ORDER BY {order_date}" # Needed to have an idempotent query + query = ( # Needed to have an idempotent query + f"SELECT {columns} FROM {schema_name}.{table_name} ORDER BY {order_date}" + ) source = dlt.resource( # type: ignore name=table_name, @@ -133,9 +134,7 @@ def table_desc(table_name, pk, schema_name, order_date, columns="*"): if __name__ == "__main__": # Input Handling - parser = argparse.ArgumentParser( - description="Run specific functions in the 
script." - ) + parser = argparse.ArgumentParser(description="Run specific functions in the script.") parser.add_argument("--replace", action="store_true", help="Run initial load") parser.add_argument("--merge", action="store_true", help="Run delta load") args = parser.parse_args() @@ -233,20 +232,26 @@ def table_desc(table_name, pk, schema_name, order_date, columns="*"): ).fetchone()[0] print(f"timestamped_schema: {timestamped_schema}") - target_credentials = ConnectionStringCredentials(dlt.secrets["destination.postgres.credentials"]) + target_credentials = ConnectionStringCredentials( + dlt.secrets["destination.postgres.credentials"] + ) # connect to destination (timestamped schema) conn.sql( - f"ATTACH 'dbname={target_credentials.database} user={target_credentials.username} password={target_credentials.password} host={target_credentials.host} port={target_credentials.port}' AS pg_db (TYPE postgres);" + "ATTACH" + f" 'dbname={target_credentials.database} user={target_credentials.username} password={target_credentials.password} host={target_credentials.host} port={target_credentials.port}'" + " AS pg_db (TYPE postgres);" ) conn.sql(f"CREATE SCHEMA IF NOT EXISTS pg_db.{timestamped_schema};") for table in tables: print( - f"LOAD DuckDB -> Postgres: table: {timestamped_schema}.{table['table_name']} TO Postgres {timestamped_schema}.{table['table_name']}" + f"LOAD DuckDB -> Postgres: table: {timestamped_schema}.{table['table_name']} TO" + f" Postgres {timestamped_schema}.{table['table_name']}" ) conn.sql( - f"CREATE OR REPLACE TABLE pg_db.{timestamped_schema}.{table['table_name']} AS SELECT * FROM {timestamped_schema}.{table['table_name']};" + f"CREATE OR REPLACE TABLE pg_db.{timestamped_schema}.{table['table_name']} AS" + f" SELECT * FROM {timestamped_schema}.{table['table_name']};" ) conn.sql( f"SELECT count(*) as count FROM pg_db.{timestamped_schema}.{table['table_name']};" @@ -262,9 +267,7 @@ def table_desc(table_name, pk, schema_name, order_date, columns="*"): assert int(rows) == 9 # 5. 
Cleanup and rename Schema - print( - "##################################### RENAME Schema and CLEANUP ########" - ) + print("##################################### RENAME Schema and CLEANUP ########") try: con_hd = psycopg2.connect( dbname=target_credentials.database, From d4340d830cd9283963880700951e2c501dea355a Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Tue, 11 Jun 2024 10:30:58 -0400 Subject: [PATCH 19/61] Fix databricks pandas error (#1443) * update dependencies for databricks/dbt * use kwargs if args not defined, fix typing * Revert to use inline params to keep support for 13.x cluster * Typing fix * adds dbt support for mssql * converts dbt deps from extra to group, allows databricks client >2.9.3 * fixes dict to env util * limits dbt version to <1.8 in destination tests * skips chess dbt package for mssql --------- Co-authored-by: Oon Tong Tan Co-authored-by: Marcin Rudolf --- .github/workflows/lint.yml | 2 +- .github/workflows/test_dbt_runner.yml | 2 +- Makefile | 2 +- dlt/common/configuration/utils.py | 12 +- .../impl/databricks/databricks.py | 2 +- .../impl/databricks/sql_client.py | 26 +- dlt/extract/decorators.py | 1 - dlt/helpers/dbt/profiles.yml | 19 +- .../chess/dbt_transform/models/load_ids.sql | 2 +- .../docs/dlt-ecosystem/destinations/mssql.md | 2 +- poetry.lock | 383 ++++++++---------- pyproject.toml | 25 +- .../configuration/test_configuration.py | 17 +- tests/load/pipeline/test_dbt_helper.py | 14 +- tests/load/utils.py | 5 +- 15 files changed, 262 insertions(+), 252 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index d6b1639685..45e4766b6c 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -59,7 +59,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --all-extras --with airflow,providers,pipeline,sentry-sdk + run: poetry install --all-extras --with airflow,providers,pipeline,sentry-sdk,dbt - name: Run make lint run: | diff --git a/.github/workflows/test_dbt_runner.yml b/.github/workflows/test_dbt_runner.yml index 85cb98a040..13810fbc0d 100644 --- a/.github/workflows/test_dbt_runner.yml +++ b/.github/workflows/test_dbt_runner.yml @@ -60,7 +60,7 @@ jobs: - name: Install dependencies # install dlt with postgres support - run: poetry install --no-interaction -E postgres -E dbt --with sentry-sdk + run: poetry install --no-interaction -E postgres --with sentry-sdk,dbt - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/Makefile b/Makefile index 92985ee82f..fd0920d188 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ has-poetry: poetry --version dev: has-poetry - poetry install --all-extras --with airflow --with docs --with providers --with pipeline --with sentry-sdk + poetry install --all-extras --with airflow,docs,providers,pipeline,sentry-sdk,dbt lint: ./tools/check-package.sh diff --git a/dlt/common/configuration/utils.py b/dlt/common/configuration/utils.py index 8f3c1789ce..6402afcfbe 100644 --- a/dlt/common/configuration/utils.py +++ b/dlt/common/configuration/utils.py @@ -178,7 +178,10 @@ def add_config_to_env(config: BaseConfiguration, sections: Tuple[str, ...] = ()) def add_config_dict_to_env( - dict_: Mapping[str, Any], sections: Tuple[str, ...] = (), overwrite_keys: bool = False + dict_: Mapping[str, Any], + sections: Tuple[str, ...] 
= (), + overwrite_keys: bool = False, + destructure_dicts: bool = True, ) -> None: """Writes values in dict_ back into environment using the naming convention of EnvironProvider. Applies `sections` if specified. Does not overwrite existing keys by default""" for k, v in dict_.items(): @@ -193,5 +196,12 @@ def add_config_dict_to_env( if env_key not in os.environ or overwrite_keys: if v is None: os.environ.pop(env_key, None) + elif isinstance(v, dict) and destructure_dicts: + add_config_dict_to_env( + v, + sections + (k,), + overwrite_keys=overwrite_keys, + destructure_dicts=destructure_dicts, + ) else: os.environ[env_key] = serialize_value(v) diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py index 0008599349..cd203e7e4d 100644 --- a/dlt/destinations/impl/databricks/databricks.py +++ b/dlt/destinations/impl/databricks/databricks.py @@ -264,7 +264,7 @@ def __init__(self, schema: Schema, config: DatabricksClientConfiguration) -> Non sql_client = DatabricksSqlClient(config.normalize_dataset_name(schema), config.credentials) super().__init__(schema, config, sql_client) self.config: DatabricksClientConfiguration = config - self.sql_client: DatabricksSqlClient = sql_client + self.sql_client: DatabricksSqlClient = sql_client # type: ignore[assignment] self.type_mapper = DatabricksTypeMapper(self.capabilities) def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: diff --git a/dlt/destinations/impl/databricks/sql_client.py b/dlt/destinations/impl/databricks/sql_client.py index 7f0ee2b5e6..530b03715a 100644 --- a/dlt/destinations/impl/databricks/sql_client.py +++ b/dlt/destinations/impl/databricks/sql_client.py @@ -1,6 +1,7 @@ from contextlib import contextmanager, suppress from typing import Any, AnyStr, ClassVar, Iterator, Optional, Sequence, List, Union, Dict + from databricks import sql as databricks_lib from databricks.sql.client import ( Connection as DatabricksSqlConnection, @@ -37,7 +38,9 @@ def __init__(self, dataset_name: str, credentials: DatabricksCredentials) -> Non def open_connection(self) -> DatabricksSqlConnection: conn_params = self.credentials.to_connector_params() - self._conn = databricks_lib.connect(**conn_params, schema=self.dataset_name) + self._conn = databricks_lib.connect( + **conn_params, schema=self.dataset_name, use_inline_params="silent" + ) return self._conn @raise_open_connection_error @@ -87,12 +90,14 @@ def execute_sql( @contextmanager @raise_database_error def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DBApiCursor]: - curr: DBApiCursor = None - # TODO: databricks connector 3.0.0 will use :named paramstyle only + curr: DBApiCursor + # TODO: Inline param support will be dropped in future databricks driver, switch to :named paramstyle + # This will drop support for cluster runtime v13.x + # db_args: Optional[Dict[str, Any]] # if args: # keys = [f"arg{i}" for i in range(len(args))] # # Replace position arguments (%s) with named arguments (:arg0, :arg1, ...) 
- # # query = query % tuple(f":{key}" for key in keys) + # query = query % tuple(f":{key}" for key in keys) # db_args = {} # for key, db_arg in zip(keys, args): # # Databricks connector doesn't accept pendulum objects @@ -102,15 +107,10 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB # db_arg = to_py_date(db_arg) # db_args[key] = db_arg # else: - # db_args = None - db_args: Optional[Union[Dict[str, Any], Sequence[Any]]] - if kwargs: - db_args = kwargs - elif args: - db_args = args - else: - db_args = None - with self._conn.cursor() as curr: + # db_args = kwargs or None + + db_args = args or kwargs or None + with self._conn.cursor() as curr: # type: ignore[assignment] curr.execute(query, db_args) yield DBApiCursorImpl(curr) # type: ignore[abstract] diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index a7246b6832..2bb4a3ce87 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -509,7 +509,6 @@ def decorator( SPEC, resolvable_fields = spec_from_signature( f, inspect.signature(f), include_defaults=standalone ) - print(SPEC, resolvable_fields, standalone) if is_inner_resource and not standalone: if len(resolvable_fields) > 0: # prevent required arguments to inner functions that are not standalone diff --git a/dlt/helpers/dbt/profiles.yml b/dlt/helpers/dbt/profiles.yml index d82eb0f2fa..a2a0014e4e 100644 --- a/dlt/helpers/dbt/profiles.yml +++ b/dlt/helpers/dbt/profiles.yml @@ -144,6 +144,23 @@ athena: work_group: "{{ env_var('DLT__ATHENA_WORK_GROUP', '') }}" +mssql: + target: analytics + outputs: + analytics: + type: sqlserver + driver: "{{ env_var('DLT__CREDENTIALS__DRIVER') }}" + server: "{{ env_var('DLT__CREDENTIALS__HOST') }}" + port: "{{ env_var('DLT__CREDENTIALS__PORT') | as_number }}" + database: "{{ env_var('DLT__CREDENTIALS__DATABASE') }}" + schema: "{{ var('destination_dataset_name', var('source_dataset_name')) }}" + user: "{{ env_var('DLT__CREDENTIALS__USERNAME') }}" + password: "{{ env_var('DLT__CREDENTIALS__PASSWORD') }}" + login_timeout: "{{ env_var('DLT__CREDENTIALS__CONNECT_TIMEOUT', '0') | as_number }}" + encrypt: "{{ (env_var('DLT__CREDENTIALS__QUERY__ENCRYPT', 'No') == 'yes') | as_bool }}" + trust_cert: "{{ (env_var('DLT__CREDENTIALS__QUERY__TRUSTSERVERCERTIFICATE', 'yes') == 'yes') | as_bool }}" + + # commented out because dbt for Synapse isn't currently properly supported. # Leave config here for potential future use. 
# synapse: @@ -157,7 +174,7 @@ athena: # database: "{{ env_var('DLT__CREDENTIALS__DATABASE') }}" # schema: "{{ var('destination_dataset_name', var('source_dataset_name')) }}" # user: "{{ env_var('DLT__CREDENTIALS__USERNAME') }}" -# password: "{{ env_var('DLT__CREDENTIALS__PASSWORD') }}" +# password: "{{ env_var('DLT__CREDENTIALS__PASSWORD') }}" databricks: diff --git a/docs/examples/chess/dbt_transform/models/load_ids.sql b/docs/examples/chess/dbt_transform/models/load_ids.sql index 005449894f..06616de200 100644 --- a/docs/examples/chess/dbt_transform/models/load_ids.sql +++ b/docs/examples/chess/dbt_transform/models/load_ids.sql @@ -12,7 +12,7 @@ {% else %} -- take only loads with status = 0 and no other records SELECT load_id, schema_name, schema_version_hash FROM {{ source('dlt', '_dlt_loads') }} - GROUP BY 1, 2, 3 + GROUP BY load_id, schema_name, schema_version_hash -- note that it is a hack - we make sure no other statuses exist HAVING SUM(status) = 0 {% endif %} \ No newline at end of file diff --git a/docs/website/docs/dlt-ecosystem/destinations/mssql.md b/docs/website/docs/dlt-ecosystem/destinations/mssql.md index 4a6f3d61df..6aac877d7b 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/mssql.md +++ b/docs/website/docs/dlt-ecosystem/destinations/mssql.md @@ -141,7 +141,7 @@ destination.mssql.credentials="mssql://loader:@loader.database.windows ``` ### dbt support -No dbt support yet. +This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-snowflake](https://github.com/dbt-msft/dbt-sqlserver). diff --git a/poetry.lock b/poetry.lock index e61d505a4a..31c9fd08ce 100644 --- a/poetry.lock +++ b/poetry.lock @@ -678,7 +678,7 @@ tests = ["mypy (>=0.800)", "pytest", "pytest-asyncio"] name = "asn1crypto" version = "1.5.1" description = "Fast ASN.1 parser and serializer with definitions for private keys, public keys, certificates, CRL, OCSP, CMS, PKCS#3, PKCS#7, PKCS#8, PKCS#12, PKCS#5, X.509 and TSP" -optional = true +optional = false python-versions = "*" files = [ {file = "asn1crypto-1.5.1-py2.py3-none-any.whl", hash = "sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67"}, @@ -779,7 +779,7 @@ cryptography = ">=3.2" name = "azure-core" version = "1.29.3" description = "Microsoft Azure Core Library for Python" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "azure-core-1.29.3.tar.gz", hash = "sha256:c92700af982e71c8c73de9f4c20da8b3f03ce2c22d13066e4d416b4629c87903"}, @@ -814,7 +814,7 @@ requests = ">=2.20.0" name = "azure-identity" version = "1.14.0" description = "Microsoft Azure Identity Library for Python" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "azure-identity-1.14.0.zip", hash = "sha256:72441799f8c5c89bfe21026965e266672a7c5d050c2c65119ef899dd5362e2b1"}, @@ -927,7 +927,7 @@ yaml = ["PyYAML"] name = "beautifulsoup4" version = "4.12.2" description = "Screen-scraping library" -optional = true +optional = false python-versions = ">=3.6.0" files = [ {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"}, @@ -1018,7 +1018,7 @@ files = [ name = "boto3" version = "1.34.34" description = "The AWS SDK for Python" -optional = true +optional = false python-versions = ">= 3.8" files = [ {file = "boto3-1.34.34-py3-none-any.whl", hash = "sha256:33a8b6d9136fa7427160edb92d2e50f2035f04e9d63a2d1027349053e12626aa"}, @@ -1417,7 +1417,7 @@ xray = ["mypy-boto3-xray (>=1.28.0,<1.29.0)"] name = "botocore" version = 
"1.34.34" description = "Low-level, data-driven core of boto 3." -optional = true +optional = false python-versions = ">= 3.8" files = [ {file = "botocore-1.34.34-py3-none-any.whl", hash = "sha256:cd060b0d88ebb2b893f1411c1db7f2ba66cc18e52dcc57ad029564ef5fec437b"}, @@ -2100,60 +2100,40 @@ nr-date = ">=2.0.0,<3.0.0" typeapi = ">=2.0.1,<3.0.0" typing-extensions = ">=3.10.0" -[[package]] -name = "databricks-sdk" -version = "0.17.0" -description = "Databricks SDK for Python (Beta)" -optional = true -python-versions = ">=3.7" -files = [ - {file = "databricks-sdk-0.17.0.tar.gz", hash = "sha256:0a1baa6783aba9b034b9a017da8d0cf839ec61ae8318792b78bfb3db0374dd9c"}, - {file = "databricks_sdk-0.17.0-py3-none-any.whl", hash = "sha256:ad90e01c7b1a9d60a3de6a35606c79ac982e8972d3ad3ff89c251c24439c8bb9"}, -] - -[package.dependencies] -google-auth = ">=2.0,<3.0" -requests = ">=2.28.1,<3" - -[package.extras] -dev = ["autoflake", "ipython", "ipywidgets", "isort", "pycodestyle", "pyfakefs", "pytest", "pytest-cov", "pytest-mock", "pytest-xdist", "requests-mock", "wheel", "yapf"] -notebook = ["ipython (>=8,<9)", "ipywidgets (>=8,<9)"] - [[package]] name = "databricks-sql-connector" -version = "2.9.3" +version = "3.1.2" description = "Databricks SQL Connector for Python" optional = true -python-versions = ">=3.7.1,<4.0.0" +python-versions = "<4.0.0,>=3.8.0" files = [ - {file = "databricks_sql_connector-2.9.3-py3-none-any.whl", hash = "sha256:e37b5aa8bea22e84a9920e87ad9ba6cafbe656008c180a790baa53b711dd9889"}, - {file = "databricks_sql_connector-2.9.3.tar.gz", hash = "sha256:09a1686de3470091e78640de276053d4e18f8c03ba3627ed45b368f78bf87db9"}, + {file = "databricks_sql_connector-3.1.2-py3-none-any.whl", hash = "sha256:5292bc25b4d8d58d301079b55086331764f067e24862c9365698b2eeddedb737"}, + {file = "databricks_sql_connector-3.1.2.tar.gz", hash = "sha256:da0df114e0824d49ccfea36c4679c95689fe359191b056ad516446a058307c37"}, ] [package.dependencies] -alembic = ">=1.0.11,<2.0.0" lz4 = ">=4.0.2,<5.0.0" numpy = [ - {version = ">=1.16.6", markers = "python_version >= \"3.7\" and python_version < \"3.11\""}, + {version = ">=1.16.6", markers = "python_version >= \"3.8\" and python_version < \"3.11\""}, {version = ">=1.23.4", markers = "python_version >= \"3.11\""}, ] oauthlib = ">=3.1.0,<4.0.0" openpyxl = ">=3.0.10,<4.0.0" -pandas = {version = ">=1.2.5,<3.0.0", markers = "python_version >= \"3.8\""} -pyarrow = [ - {version = ">=6.0.0", markers = "python_version >= \"3.7\" and python_version < \"3.11\""}, - {version = ">=10.0.1", markers = "python_version >= \"3.11\""}, -] +pandas = {version = ">=1.2.5,<2.2.0", markers = "python_version >= \"3.8\""} +pyarrow = ">=14.0.1,<15.0.0" requests = ">=2.18.1,<3.0.0" -sqlalchemy = ">=1.3.24,<2.0.0" thrift = ">=0.16.0,<0.17.0" -urllib3 = ">=1.0" +urllib3 = ">=1.26" + +[package.extras] +alembic = ["alembic (>=1.0.11,<2.0.0)", "sqlalchemy (>=2.0.21)"] +sqlalchemy = ["sqlalchemy (>=2.0.21)"] [[package]] name = "dbt-athena-community" version = "1.7.1" description = "The athena adapter plugin for dbt (data build tool)" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "dbt-athena-community-1.7.1.tar.gz", hash = "sha256:02c7bc461628e2adbfaf9d3f51fbe9a5cb5e06ee2ea8329259758518ceafdc12"}, @@ -2173,7 +2153,7 @@ tenacity = ">=8.2,<9.0" name = "dbt-bigquery" version = "1.7.2" description = "The Bigquery adapter plugin for dbt" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "dbt-bigquery-1.7.2.tar.gz", hash = 
"sha256:27c7f492f65ab5d1d43432a4467a436fc3637e3cb72c5b4ab07ddf7573c43596"}, @@ -2223,23 +2203,6 @@ sqlparse = ">=0.2.3,<0.5" typing-extensions = ">=3.7.4" urllib3 = ">=1.0,<2.0" -[[package]] -name = "dbt-databricks" -version = "1.7.3" -description = "The Databricks adapter plugin for dbt" -optional = true -python-versions = ">=3.8" -files = [ - {file = "dbt-databricks-1.7.3.tar.gz", hash = "sha256:045e26240c825342259a59004c2e35e7773b0b6cbb255e6896bd46d3810f9607"}, - {file = "dbt_databricks-1.7.3-py3-none-any.whl", hash = "sha256:7c2b7bd7228a401d8262781749fc496c825fe6050e661e5ab3f1c66343e311cc"}, -] - -[package.dependencies] -databricks-sdk = ">=0.9.0" -databricks-sql-connector = ">=2.9.3,<3.0.0" -dbt-spark = "1.7.1" -keyring = ">=23.13.0" - [[package]] name = "dbt-duckdb" version = "1.7.1" @@ -2283,11 +2246,27 @@ files = [ {file = "dbt_extractor-0.5.1.tar.gz", hash = "sha256:cd5d95576a8dea4190240aaf9936a37fd74b4b7913ca69a3c368fc4472bb7e13"}, ] +[[package]] +name = "dbt-fabric" +version = "1.7.4" +description = "A Microsoft Fabric Synapse Data Warehouse adapter plugin for dbt" +optional = false +python-versions = "*" +files = [ + {file = "dbt-fabric-1.7.4.tar.gz", hash = "sha256:6f17f0ba683c2944c8f846589dca4b54106579af32ac5acafe701d1becc2496f"}, + {file = "dbt_fabric-1.7.4-py3-none-any.whl", hash = "sha256:e59dfe36ca8d3e47e7c22280d56e04da3529f6fc3c7855dab6fcef2d8f820f7e"}, +] + +[package.dependencies] +azure-identity = ">=1.12.0" +dbt-core = ">=1.7.2,<1.8.0" +pyodbc = ">=4.0.35,<5.1.0" + [[package]] name = "dbt-postgres" version = "1.7.4" description = "The postgres adapter plugin for dbt (data build tool)" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "dbt-postgres-1.7.4.tar.gz", hash = "sha256:16185b8de36d1a2052a2e4b85512306ab55085b1ea323a353d0dc3628473208d"}, @@ -2303,7 +2282,7 @@ psycopg2-binary = ">=2.8,<3.0" name = "dbt-redshift" version = "1.7.1" description = "The Redshift adapter plugin for dbt" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "dbt-redshift-1.7.1.tar.gz", hash = "sha256:6da69a83038d011570d131b85171842d0858a46bca3757419ae193b5724a2119"}, @@ -2342,7 +2321,7 @@ typing-extensions = ">=4.4,<5.0" name = "dbt-snowflake" version = "1.7.1" description = "The Snowflake adapter plugin for dbt" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "dbt-snowflake-1.7.1.tar.gz", hash = "sha256:842a9e87b9e2d999e3bc27aaa369398a4d02bb3f8bb7447aa6151204d4eb90f0"}, @@ -2355,25 +2334,21 @@ dbt-core = ">=1.7.0,<1.8.0" snowflake-connector-python = {version = ">=3.0,<4.0", extras = ["secure-local-storage"]} [[package]] -name = "dbt-spark" -version = "1.7.1" -description = "The Apache Spark adapter plugin for dbt" -optional = true -python-versions = ">=3.8" +name = "dbt-sqlserver" +version = "1.7.4" +description = "A Microsoft SQL Server adapter plugin for dbt" +optional = false +python-versions = "*" files = [ - {file = "dbt-spark-1.7.1.tar.gz", hash = "sha256:a10e5d1bfdb2ca98e7ae2badd06150e2695d9d4fa18ae2354ed5bd093d77f947"}, - {file = "dbt_spark-1.7.1-py3-none-any.whl", hash = "sha256:99b5002edcdb82058a3b0ad33eb18b91a4bdde887d94855e8bd6f633d78837dc"}, + {file = "dbt-sqlserver-1.7.4.tar.gz", hash = "sha256:b9e85771a1c00e8f4aadefb37b00d02b3d49bc93ad7c52782fd9cae9db31dd98"}, + {file = "dbt_sqlserver-1.7.4-py3-none-any.whl", hash = "sha256:7b67babca5ad5d7b9be923250e8a32ffe94dfef78ee2d3ce6bd145cc3e157a8d"}, ] [package.dependencies] -dbt-core = ">=1.7.0,<1.8.0" -sqlparams = ">=3.0.0" - -[package.extras] 
-all = ["PyHive[hive-pure-sasl] (>=0.7.0,<0.8.0)", "pyodbc (>=4.0.39,<4.1.0)", "pyspark (>=3.0.0,<4.0.0)", "thrift (>=0.11.0,<0.17.0)"] -odbc = ["pyodbc (>=4.0.39,<4.1.0)"] -pyhive = ["PyHive[hive-pure-sasl] (>=0.7.0,<0.8.0)", "thrift (>=0.11.0,<0.17.0)"] -session = ["pyspark (>=3.0.0,<4.0.0)"] +azure-identity = ">=1.12.0" +dbt-core = ">=1.7.2,<1.8.0" +dbt-fabric = ">=1.7.2,<1.8.0" +pyodbc = ">=4.0.35,<5.1.0" [[package]] name = "decopatch" @@ -2703,7 +2678,7 @@ tqdm = ">=4.65,<5.0" name = "filelock" version = "3.12.3" description = "A platform independent file lock." -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "filelock-3.12.3-py3-none-any.whl", hash = "sha256:f067e40ccc40f2b48395a80fcbd4728262fab54e232e090a4063ab804179efeb"}, @@ -3324,7 +3299,7 @@ tool = ["click (>=6.0.0)"] name = "google-cloud-bigquery" version = "3.19.0" description = "Google BigQuery API client library" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "google-cloud-bigquery-3.19.0.tar.gz", hash = "sha256:8e311dae49768e1501fcdc5e916bff4b7e169471e5707919f4a6f78a02b3b5a6"}, @@ -3355,7 +3330,7 @@ tqdm = ["tqdm (>=4.7.4,<5.0.0dev)"] name = "google-cloud-core" version = "2.3.3" description = "Google Cloud API client core library" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "google-cloud-core-2.3.3.tar.gz", hash = "sha256:37b80273c8d7eee1ae816b3a20ae43585ea50506cb0e60f3cf5be5f87f1373cb"}, @@ -3373,7 +3348,7 @@ grpc = ["grpcio (>=1.38.0,<2.0dev)"] name = "google-cloud-dataproc" version = "5.4.3" description = "Google Cloud Dataproc API client library" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "google-cloud-dataproc-5.4.3.tar.gz", hash = "sha256:d9c77c52aa5ddf52ae657736dbfb5312402933f72bab8480fc2d2afe98697402"}, @@ -3393,7 +3368,7 @@ protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4 name = "google-cloud-storage" version = "2.10.0" description = "Google Cloud Storage API client library" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "google-cloud-storage-2.10.0.tar.gz", hash = "sha256:934b31ead5f3994e5360f9ff5750982c5b6b11604dc072bc452c25965e076dc7"}, @@ -3414,7 +3389,7 @@ protobuf = ["protobuf (<5.0.0dev)"] name = "google-crc32c" version = "1.5.0" description = "A python wrapper of the C library 'Google CRC32C'" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "google-crc32c-1.5.0.tar.gz", hash = "sha256:89284716bc6a5a415d4eaa11b1726d2d60a0cd12aadf5439828353662ede9dd7"}, @@ -3710,7 +3685,7 @@ files = [ name = "google-resumable-media" version = "2.5.0" description = "Utilities for Google Media Downloads and Resumable Uploads" -optional = true +optional = false python-versions = ">= 3.7" files = [ {file = "google-resumable-media-2.5.0.tar.gz", hash = "sha256:218931e8e2b2a73a58eb354a288e03a0fd5fb1c4583261ac6e4c078666468c93"}, @@ -3830,7 +3805,7 @@ test = ["objgraph", "psutil"] name = "grpc-google-iam-v1" version = "0.12.6" description = "IAM API client library" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "grpc-google-iam-v1-0.12.6.tar.gz", hash = "sha256:2bc4b8fdf22115a65d751c9317329322602c39b7c86a289c9b72d228d960ef5f"}, @@ -3903,7 +3878,7 @@ protobuf = ["grpcio-tools (>=1.57.0)"] name = "grpcio-status" version = "1.57.0" description = "Status proto mapping for gRPC" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = 
"grpcio-status-1.57.0.tar.gz", hash = "sha256:b098da99df1eebe58337f8f78e50df990273ccacc1226fddeb47c590e3df9e02"}, @@ -4262,7 +4237,7 @@ files = [ name = "jaraco-classes" version = "3.3.0" description = "Utility functions for Python class constructs" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "jaraco.classes-3.3.0-py3-none-any.whl", hash = "sha256:10afa92b6743f25c0cf5f37c6bb6e18e2c5bb84a16527ccfc0040ea377e7aaeb"}, @@ -4280,7 +4255,7 @@ testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", name = "jeepney" version = "0.8.0" description = "Low-level, pure Python DBus protocol wrapper." -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "jeepney-0.8.0-py3-none-any.whl", hash = "sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755"}, @@ -4326,7 +4301,7 @@ ansicon = {version = "*", markers = "platform_system == \"Windows\""} name = "jmespath" version = "1.0.1" description = "JSON Matching Expressions" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, @@ -4392,7 +4367,7 @@ referencing = ">=0.28.0" name = "keyring" version = "24.2.0" description = "Store and access your passwords safely." -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "keyring-24.2.0-py3-none-any.whl", hash = "sha256:4901caaf597bfd3bbd78c9a0c7c4c29fcd8310dab2cffefe749e916b6527acd6"}, @@ -4564,7 +4539,7 @@ zmq = ["pyzmq"] name = "lxml" version = "4.9.3" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." -optional = true +optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" files = [ {file = "lxml-4.9.3-cp27-cp27m-macosx_11_0_x86_64.whl", hash = "sha256:b0a545b46b526d418eb91754565ba5b63b1c0b12f9bd2f808c852d9b4b2f9b5c"}, @@ -4998,7 +4973,7 @@ six = ">=1.9.0,<2.0" name = "mmh3" version = "4.0.1" description = "Python extension for MurmurHash (MurmurHash3), a set of fast and robust hash functions." -optional = true +optional = false python-versions = "*" files = [ {file = "mmh3-4.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b719ba87232749095011d567a36a25e40ed029fc61c47e74a12416d8bb60b311"}, @@ -5102,7 +5077,7 @@ tests = ["pytest (>=4.6)"] name = "msal" version = "1.23.0" description = "The Microsoft Authentication Library (MSAL) for Python library enables your app to access the Microsoft Cloud by supporting authentication of users with Microsoft Azure Active Directory accounts (AAD) and Microsoft Accounts (MSA) using industry standard OAuth2 and OpenID Connect." -optional = true +optional = false python-versions = "*" files = [ {file = "msal-1.23.0-py2.py3-none-any.whl", hash = "sha256:3342e0837a047007f9d479e814b559c3219767453d57920dc40a31986862048b"}, @@ -5121,7 +5096,7 @@ broker = ["pymsalruntime (>=0.13.2,<0.14)"] name = "msal-extensions" version = "1.0.0" description = "Microsoft Authentication Library extensions (MSAL EX) provides a persistence API that can save your data on disk, encrypted on Windows, macOS and Linux. Concurrent data access will be coordinated by a file lock mechanism." 
-optional = true +optional = false python-versions = "*" files = [ {file = "msal-extensions-1.0.0.tar.gz", hash = "sha256:c676aba56b0cce3783de1b5c5ecfe828db998167875126ca4b47dc6436451354"}, @@ -5341,7 +5316,7 @@ reports = ["lxml"] name = "mypy-boto3-athena" version = "1.28.36" description = "Type annotations for boto3.Athena 1.28.36 service generated with mypy-boto3-builder 7.18.0" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "mypy-boto3-athena-1.28.36.tar.gz", hash = "sha256:a76df6aace3dc1d91b3f74640d617cd1b4802e5f348a22db2f16dfce0b01ee26"}, @@ -5355,7 +5330,7 @@ typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.12\""} name = "mypy-boto3-glue" version = "1.28.36" description = "Type annotations for boto3.Glue 1.28.36 service generated with mypy-boto3-builder 7.18.0" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "mypy-boto3-glue-1.28.36.tar.gz", hash = "sha256:161771252bb6a220a0bfd8e6ad71da8548599c611f95fe8a94846f4a3386d2ae"}, @@ -5369,7 +5344,7 @@ typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.12\""} name = "mypy-boto3-lakeformation" version = "1.28.36" description = "Type annotations for boto3.LakeFormation 1.28.36 service generated with mypy-boto3-builder 7.18.0" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "mypy-boto3-lakeformation-1.28.36.tar.gz", hash = "sha256:9327cf0d28a09abf5bd90ae946ce7420b32a3b979a1a3554ac93716c3dceacb0"}, @@ -5383,7 +5358,7 @@ typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.12\""} name = "mypy-boto3-sts" version = "1.28.37" description = "Type annotations for boto3.STS 1.28.37 service generated with mypy-boto3-builder 7.18.2" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "mypy-boto3-sts-1.28.37.tar.gz", hash = "sha256:54d64ca695ab90a51c68ac1e67ff9eae7ec69f926649e320a3b90ed1ec841a95"}, @@ -5911,8 +5886,8 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.20.3", markers = "python_version < \"3.10\""}, - {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -5943,71 +5918,67 @@ xml = ["lxml (>=4.6.3)"] [[package]] name = "pandas" -version = "2.2.0" +version = "2.1.4" description = "Powerful data structures for data analysis, time series, and statistics" optional = false python-versions = ">=3.9" files = [ - {file = "pandas-2.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8108ee1712bb4fa2c16981fba7e68b3f6ea330277f5ca34fa8d557e986a11670"}, - {file = "pandas-2.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:736da9ad4033aeab51d067fc3bd69a0ba36f5a60f66a527b3d72e2030e63280a"}, - {file = "pandas-2.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38e0b4fc3ddceb56ec8a287313bc22abe17ab0eb184069f08fc6a9352a769b18"}, - {file = "pandas-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20404d2adefe92aed3b38da41d0847a143a09be982a31b85bc7dd565bdba0f4e"}, - {file = "pandas-2.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7ea3ee3f125032bfcade3a4cf85131ed064b4f8dd23e5ce6fa16473e48ebcaf5"}, - {file = "pandas-2.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:f9670b3ac00a387620489dfc1bca66db47a787f4e55911f1293063a78b108df1"}, - {file = "pandas-2.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:5a946f210383c7e6d16312d30b238fd508d80d927014f3b33fb5b15c2f895430"}, - {file = "pandas-2.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a1b438fa26b208005c997e78672f1aa8138f67002e833312e6230f3e57fa87d5"}, - {file = "pandas-2.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8ce2fbc8d9bf303ce54a476116165220a1fedf15985b09656b4b4275300e920b"}, - {file = "pandas-2.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2707514a7bec41a4ab81f2ccce8b382961a29fbe9492eab1305bb075b2b1ff4f"}, - {file = "pandas-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85793cbdc2d5bc32620dc8ffa715423f0c680dacacf55056ba13454a5be5de88"}, - {file = "pandas-2.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:cfd6c2491dc821b10c716ad6776e7ab311f7df5d16038d0b7458bc0b67dc10f3"}, - {file = "pandas-2.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a146b9dcacc3123aa2b399df1a284de5f46287a4ab4fbfc237eac98a92ebcb71"}, - {file = "pandas-2.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:fbc1b53c0e1fdf16388c33c3cca160f798d38aea2978004dd3f4d3dec56454c9"}, - {file = "pandas-2.2.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a41d06f308a024981dcaa6c41f2f2be46a6b186b902c94c2674e8cb5c42985bc"}, - {file = "pandas-2.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:159205c99d7a5ce89ecfc37cb08ed179de7783737cea403b295b5eda8e9c56d1"}, - {file = "pandas-2.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eb1e1f3861ea9132b32f2133788f3b14911b68102d562715d71bd0013bc45440"}, - {file = "pandas-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:761cb99b42a69005dec2b08854fb1d4888fdf7b05db23a8c5a099e4b886a2106"}, - {file = "pandas-2.2.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a20628faaf444da122b2a64b1e5360cde100ee6283ae8effa0d8745153809a2e"}, - {file = "pandas-2.2.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f5be5d03ea2073627e7111f61b9f1f0d9625dc3c4d8dda72cc827b0c58a1d042"}, - {file = "pandas-2.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:a626795722d893ed6aacb64d2401d017ddc8a2341b49e0384ab9bf7112bdec30"}, - {file = "pandas-2.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9f66419d4a41132eb7e9a73dcec9486cf5019f52d90dd35547af11bc58f8637d"}, - {file = "pandas-2.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:57abcaeda83fb80d447f28ab0cc7b32b13978f6f733875ebd1ed14f8fbc0f4ab"}, - {file = "pandas-2.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e60f1f7dba3c2d5ca159e18c46a34e7ca7247a73b5dd1a22b6d59707ed6b899a"}, - {file = "pandas-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb61dc8567b798b969bcc1fc964788f5a68214d333cade8319c7ab33e2b5d88a"}, - {file = "pandas-2.2.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:52826b5f4ed658fa2b729264d63f6732b8b29949c7fd234510d57c61dbeadfcd"}, - {file = "pandas-2.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bde2bc699dbd80d7bc7f9cab1e23a95c4375de615860ca089f34e7c64f4a8de7"}, - {file = "pandas-2.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:3de918a754bbf2da2381e8a3dcc45eede8cd7775b047b923f9006d5f876802ae"}, - {file = "pandas-2.2.0.tar.gz", hash = "sha256:30b83f7c3eb217fb4d1b494a57a2fda5444f17834f5df2de6b2ffff68dc3c8e2"}, + {file = "pandas-2.1.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:bdec823dc6ec53f7a6339a0e34c68b144a7a1fd28d80c260534c39c62c5bf8c9"}, + {file = "pandas-2.1.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:294d96cfaf28d688f30c918a765ea2ae2e0e71d3536754f4b6de0ea4a496d034"}, + {file = "pandas-2.1.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b728fb8deba8905b319f96447a27033969f3ea1fea09d07d296c9030ab2ed1d"}, + {file = "pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00028e6737c594feac3c2df15636d73ace46b8314d236100b57ed7e4b9ebe8d9"}, + {file = "pandas-2.1.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:426dc0f1b187523c4db06f96fb5c8d1a845e259c99bda74f7de97bd8a3bb3139"}, + {file = "pandas-2.1.4-cp310-cp310-win_amd64.whl", hash = "sha256:f237e6ca6421265643608813ce9793610ad09b40154a3344a088159590469e46"}, + {file = "pandas-2.1.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b7d852d16c270e4331f6f59b3e9aa23f935f5c4b0ed2d0bc77637a8890a5d092"}, + {file = "pandas-2.1.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bd7d5f2f54f78164b3d7a40f33bf79a74cdee72c31affec86bfcabe7e0789821"}, + {file = "pandas-2.1.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0aa6e92e639da0d6e2017d9ccff563222f4eb31e4b2c3cf32a2a392fc3103c0d"}, + {file = "pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d797591b6846b9db79e65dc2d0d48e61f7db8d10b2a9480b4e3faaddc421a171"}, + {file = "pandas-2.1.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d2d3e7b00f703aea3945995ee63375c61b2e6aa5aa7871c5d622870e5e137623"}, + {file = "pandas-2.1.4-cp311-cp311-win_amd64.whl", hash = "sha256:dc9bf7ade01143cddc0074aa6995edd05323974e6e40d9dbde081021ded8510e"}, + {file = "pandas-2.1.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:482d5076e1791777e1571f2e2d789e940dedd927325cc3cb6d0800c6304082f6"}, + {file = "pandas-2.1.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8a706cfe7955c4ca59af8c7a0517370eafbd98593155b48f10f9811da440248b"}, + {file = "pandas-2.1.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b0513a132a15977b4a5b89aabd304647919bc2169eac4c8536afb29c07c23540"}, + {file = "pandas-2.1.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9f17f2b6fc076b2a0078862547595d66244db0f41bf79fc5f64a5c4d635bead"}, + {file = "pandas-2.1.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:45d63d2a9b1b37fa6c84a68ba2422dc9ed018bdaa668c7f47566a01188ceeec1"}, + {file = "pandas-2.1.4-cp312-cp312-win_amd64.whl", hash = "sha256:f69b0c9bb174a2342818d3e2778584e18c740d56857fc5cdb944ec8bbe4082cf"}, + {file = "pandas-2.1.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3f06bda01a143020bad20f7a85dd5f4a1600112145f126bc9e3e42077c24ef34"}, + {file = "pandas-2.1.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab5796839eb1fd62a39eec2916d3e979ec3130509930fea17fe6f81e18108f6a"}, + {file = "pandas-2.1.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edbaf9e8d3a63a9276d707b4d25930a262341bca9874fcb22eff5e3da5394732"}, + {file = "pandas-2.1.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ebfd771110b50055712b3b711b51bee5d50135429364d0498e1213a7adc2be8"}, + {file = "pandas-2.1.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8ea107e0be2aba1da619cc6ba3f999b2bfc9669a83554b1904ce3dd9507f0860"}, + {file = "pandas-2.1.4-cp39-cp39-win_amd64.whl", hash = "sha256:d65148b14788b3758daf57bf42725caa536575da2b64df9964c563b015230984"}, + {file = 
"pandas-2.1.4.tar.gz", hash = "sha256:fcb68203c833cc735321512e13861358079a96c174a61f5116a1de89c58c0ef7"}, ] [package.dependencies] numpy = {version = ">=1.26.0,<2", markers = "python_version >= \"3.12\""} python-dateutil = ">=2.8.2" pytz = ">=2020.1" -tzdata = ">=2022.7" +tzdata = ">=2022.1" [package.extras] -all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] -aws = ["s3fs (>=2022.11.0)"] -clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] -compression = ["zstandard (>=0.19.0)"] -computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] +all = ["PyQt5 (>=5.15.6)", "SQLAlchemy (>=1.4.36)", "beautifulsoup4 (>=4.11.1)", "bottleneck (>=1.3.4)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=0.8.1)", "fsspec (>=2022.05.0)", "gcsfs (>=2022.05.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.8.0)", "matplotlib (>=3.6.1)", "numba (>=0.55.2)", "numexpr (>=2.8.0)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pandas-gbq (>=0.17.5)", "psycopg2 (>=2.9.3)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.5)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "pyxlsb (>=1.0.9)", "qtpy (>=2.2.0)", "s3fs (>=2022.05.0)", "scipy (>=1.8.1)", "tables (>=3.7.0)", "tabulate (>=0.8.10)", "xarray (>=2022.03.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)", "zstandard (>=0.17.0)"] +aws = ["s3fs (>=2022.05.0)"] +clipboard = ["PyQt5 (>=5.15.6)", "qtpy (>=2.2.0)"] +compression = ["zstandard (>=0.17.0)"] +computation = ["scipy (>=1.8.1)", "xarray (>=2022.03.0)"] consortium-standard = ["dataframe-api-compat (>=0.1.7)"] -excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] -feather = ["pyarrow (>=10.0.1)"] -fss = ["fsspec (>=2022.11.0)"] -gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] -hdf5 = ["tables (>=3.8.0)"] -html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] -mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] -output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] -parquet = ["pyarrow (>=10.0.1)"] -performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] -plot = ["matplotlib (>=3.6.3)"] -postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] -spss = ["pyreadstat (>=1.2.0)"] -sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pyxlsb (>=1.0.9)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2022.05.0)"] +gcp = ["gcsfs (>=2022.05.0)", "pandas-gbq (>=0.17.5)"] +hdf5 = ["tables (>=3.7.0)"] +html = ["beautifulsoup4 (>=4.11.1)", 
"html5lib (>=1.1)", "lxml (>=4.8.0)"] +mysql = ["SQLAlchemy (>=1.4.36)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.8.10)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck (>=1.3.4)", "numba (>=0.55.2)", "numexpr (>=2.8.0)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.36)", "psycopg2 (>=2.9.3)"] +spss = ["pyreadstat (>=1.1.5)"] +sql-other = ["SQLAlchemy (>=1.4.36)"] test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] -xml = ["lxml (>=4.9.2)"] +xml = ["lxml (>=4.8.0)"] [[package]] name = "parsedatetime" @@ -6232,7 +6203,7 @@ files = [ name = "portalocker" version = "2.7.0" description = "Wraps the portalocker recipe for easy usage" -optional = true +optional = false python-versions = ">=3.5" files = [ {file = "portalocker-2.7.0-py2.py3-none-any.whl", hash = "sha256:a07c5b4f3985c3cf4798369631fb7011adb498e2a46d8440efc75a8f29a0f983"}, @@ -6279,7 +6250,7 @@ dev = ["nose", "pipreqs", "twine"] name = "proto-plus" version = "1.22.3" description = "Beautiful, Pythonic protocol buffers." -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "proto-plus-1.22.3.tar.gz", hash = "sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b"}, @@ -6436,47 +6407,47 @@ files = [ [[package]] name = "pyarrow" -version = "16.1.0" +version = "14.0.2" description = "Python library for Apache Arrow" optional = false python-versions = ">=3.8" files = [ - {file = "pyarrow-16.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9"}, - {file = "pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c"}, - {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd"}, - {file = "pyarrow-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff"}, - {file = "pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c"}, - {file = "pyarrow-16.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e"}, - {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b"}, - {file = "pyarrow-16.1.0-cp311-cp311-win_amd64.whl", hash = 
"sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b"}, - {file = "pyarrow-16.1.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f"}, - {file = "pyarrow-16.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628"}, - {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7"}, - {file = "pyarrow-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444"}, - {file = "pyarrow-16.1.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f"}, - {file = "pyarrow-16.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15"}, - {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed"}, - {file = "pyarrow-16.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710"}, - {file = "pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55"}, - {file = "pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa"}, - {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3"}, - {file = "pyarrow-16.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a"}, - {file = "pyarrow-16.1.0.tar.gz", hash = "sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315"}, + {file = "pyarrow-14.0.2-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:ba9fe808596c5dbd08b3aeffe901e5f81095baaa28e7d5118e01354c64f22807"}, + {file = 
"pyarrow-14.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:22a768987a16bb46220cef490c56c671993fbee8fd0475febac0b3e16b00a10e"}, + {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dbba05e98f247f17e64303eb876f4a80fcd32f73c7e9ad975a83834d81f3fda"}, + {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a898d134d00b1eca04998e9d286e19653f9d0fcb99587310cd10270907452a6b"}, + {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:87e879323f256cb04267bb365add7208f302df942eb943c93a9dfeb8f44840b1"}, + {file = "pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:76fc257559404ea5f1306ea9a3ff0541bf996ff3f7b9209fc517b5e83811fa8e"}, + {file = "pyarrow-14.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0c4a18e00f3a32398a7f31da47fefcd7a927545b396e1f15d0c85c2f2c778cd"}, + {file = "pyarrow-14.0.2-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:87482af32e5a0c0cce2d12eb3c039dd1d853bd905b04f3f953f147c7a196915b"}, + {file = "pyarrow-14.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:059bd8f12a70519e46cd64e1ba40e97eae55e0cbe1695edd95384653d7626b23"}, + {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f16111f9ab27e60b391c5f6d197510e3ad6654e73857b4e394861fc79c37200"}, + {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06ff1264fe4448e8d02073f5ce45a9f934c0f3db0a04460d0b01ff28befc3696"}, + {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:6dd4f4b472ccf4042f1eab77e6c8bce574543f54d2135c7e396f413046397d5a"}, + {file = "pyarrow-14.0.2-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:32356bfb58b36059773f49e4e214996888eeea3a08893e7dbde44753799b2a02"}, + {file = "pyarrow-14.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:52809ee69d4dbf2241c0e4366d949ba035cbcf48409bf404f071f624ed313a2b"}, + {file = "pyarrow-14.0.2-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:c87824a5ac52be210d32906c715f4ed7053d0180c1060ae3ff9b7e560f53f944"}, + {file = "pyarrow-14.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a25eb2421a58e861f6ca91f43339d215476f4fe159eca603c55950c14f378cc5"}, + {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c1da70d668af5620b8ba0a23f229030a4cd6c5f24a616a146f30d2386fec422"}, + {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2cc61593c8e66194c7cdfae594503e91b926a228fba40b5cf25cc593563bcd07"}, + {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:78ea56f62fb7c0ae8ecb9afdd7893e3a7dbeb0b04106f5c08dbb23f9c0157591"}, + {file = "pyarrow-14.0.2-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:37c233ddbce0c67a76c0985612fef27c0c92aef9413cf5aa56952f359fcb7379"}, + {file = "pyarrow-14.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:e4b123ad0f6add92de898214d404e488167b87b5dd86e9a434126bc2b7a5578d"}, + {file = "pyarrow-14.0.2-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:e354fba8490de258be7687f341bc04aba181fc8aa1f71e4584f9890d9cb2dec2"}, + {file = "pyarrow-14.0.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:20e003a23a13da963f43e2b432483fdd8c38dc8882cd145f09f21792e1cf22a1"}, + {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc0de7575e841f1595ac07e5bc631084fd06ca8b03c0f2ecece733d23cd5102a"}, + {file = 
"pyarrow-14.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66e986dc859712acb0bd45601229021f3ffcdfc49044b64c6d071aaf4fa49e98"}, + {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:f7d029f20ef56673a9730766023459ece397a05001f4e4d13805111d7c2108c0"}, + {file = "pyarrow-14.0.2-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:209bac546942b0d8edc8debda248364f7f668e4aad4741bae58e67d40e5fcf75"}, + {file = "pyarrow-14.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:1e6987c5274fb87d66bb36816afb6f65707546b3c45c44c28e3c4133c010a881"}, + {file = "pyarrow-14.0.2-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:a01d0052d2a294a5f56cc1862933014e696aa08cc7b620e8c0cce5a5d362e976"}, + {file = "pyarrow-14.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a51fee3a7db4d37f8cda3ea96f32530620d43b0489d169b285d774da48ca9785"}, + {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64df2bf1ef2ef14cee531e2dfe03dd924017650ffaa6f9513d7a1bb291e59c15"}, + {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c0fa3bfdb0305ffe09810f9d3e2e50a2787e3a07063001dcd7adae0cee3601a"}, + {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c65bf4fd06584f058420238bc47a316e80dda01ec0dfb3044594128a6c2db794"}, + {file = "pyarrow-14.0.2-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:63ac901baec9369d6aae1cbe6cca11178fb018a8d45068aaf5bb54f94804a866"}, + {file = "pyarrow-14.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:75ee0efe7a87a687ae303d63037d08a48ef9ea0127064df18267252cfe2e9541"}, + {file = "pyarrow-14.0.2.tar.gz", hash = "sha256:36cef6ba12b499d864d1def3e990f97949e0b79400d08b7cf74504ffbd3eb025"}, ] [package.dependencies] @@ -6522,7 +6493,7 @@ pyasn1 = ">=0.4.6,<0.6.0" name = "pyathena" version = "3.0.6" description = "Python DB API 2.0 (PEP 249) client for Amazon Athena" -optional = true +optional = false python-versions = ">=3.8.1" files = [ {file = "pyathena-3.0.6-py3-none-any.whl", hash = "sha256:27fb606a73644e62be8ef9b86cdf583ab3cb9f8cac9c2ad8f05b7ad6d4eaaa87"}, @@ -6889,7 +6860,7 @@ rsa = ["cryptography"] name = "pyodbc" version = "4.0.39" description = "DB API Module for ODBC" -optional = true +optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ {file = "pyodbc-4.0.39-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:74af348dbaee4885998858daf50c8964e767629ecf6c195868b016367b0bb861"}, @@ -6933,7 +6904,7 @@ files = [ name = "pyopenssl" version = "23.2.0" description = "Python wrapper module around the OpenSSL library" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "pyOpenSSL-23.2.0-py3-none-any.whl", hash = "sha256:24f0dc5227396b3e831f4c7f602b950a5e9833d292c8e4a2e06b709292806ae2"}, @@ -7203,7 +7174,7 @@ files = [ name = "pywin32" version = "306" description = "Python for Window Extensions" -optional = true +optional = false python-versions = "*" files = [ {file = "pywin32-306-cp310-cp310-win32.whl", hash = "sha256:06d3420a5155ba65f0b72f2699b5bacf3109f36acbe8923765c22938a69dfc8d"}, @@ -7226,7 +7197,7 @@ files = [ name = "pywin32-ctypes" version = "0.2.2" description = "A (partial) reimplementation of pywin32 using ctypes/cffi" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "pywin32-ctypes-0.2.2.tar.gz", hash = "sha256:3426e063bdd5fd4df74a14fa3cf80a0b42845a87e1d1e81f6549f9daec593a60"}, @@ -7324,7 +7295,7 @@ fastembed = ["fastembed 
(==0.1.1)"] name = "redshift-connector" version = "2.0.915" description = "Redshift interface library" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "redshift_connector-2.0.915-py3-none-any.whl", hash = "sha256:d02e8d6fa01dd46504c879953f6abd7fa72980edd1e6a80202448fe35fb4c9e4"}, @@ -7764,7 +7735,7 @@ boto3 = ["aiobotocore[boto3] (>=2.5.4,<3.0.0)"] name = "s3transfer" version = "0.10.0" description = "An Amazon S3 Transfer Manager" -optional = true +optional = false python-versions = ">= 3.8" files = [ {file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"}, @@ -7781,7 +7752,7 @@ crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"] name = "scramp" version = "1.4.4" description = "An implementation of the SCRAM protocol." -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "scramp-1.4.4-py3-none-any.whl", hash = "sha256:b142312df7c2977241d951318b7ee923d6b7a4f75ba0f05b621ece1ed616faa3"}, @@ -7795,7 +7766,7 @@ asn1crypto = ">=1.5.1" name = "secretstorage" version = "3.3.3" description = "Python bindings to FreeDesktop.org Secret Service API" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99"}, @@ -8091,7 +8062,7 @@ files = [ name = "snowflake-connector-python" version = "3.5.0" description = "Snowflake Connector for Python" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "snowflake-connector-python-3.5.0.tar.gz", hash = "sha256:654e4a1f68a491544bd8f7c5ab02eb8531df67c5f4309d5253bd204044f8a1b3"}, @@ -8146,7 +8117,7 @@ secure-local-storage = ["keyring (!=16.1.0,<25.0.0)"] name = "sortedcontainers" version = "2.4.0" description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" -optional = true +optional = false python-versions = "*" files = [ {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, @@ -8157,7 +8128,7 @@ files = [ name = "soupsieve" version = "2.5" description = "A modern CSS selector implementation for Beautiful Soup." -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, @@ -8315,17 +8286,6 @@ toml = {version = "*", markers = "python_version < \"3.11\""} tqdm = "*" typing-extensions = "*" -[[package]] -name = "sqlparams" -version = "6.0.1" -description = "Convert between various DB API 2.0 parameter styles." 
-optional = true -python-versions = ">=3.8" -files = [ - {file = "sqlparams-6.0.1-py3-none-any.whl", hash = "sha256:566651376315c832876be4a0f58ffa23a23fab257d77ee492bdf8d301e169d0d"}, - {file = "sqlparams-6.0.1.tar.gz", hash = "sha256:032b2f949d4afbcbfa24003f6fb407f2fc8468184e3d8ca3d59ba6b30d4935bf"}, -] - [[package]] name = "sqlparse" version = "0.4.4" @@ -9302,7 +9262,6 @@ bigquery = ["gcsfs", "google-cloud-bigquery", "grpcio", "pyarrow"] cli = ["cron-descriptor", "pipdeptree"] clickhouse = ["adlfs", "clickhouse-connect", "clickhouse-driver", "gcsfs", "pyarrow", "s3fs"] databricks = ["databricks-sql-connector"] -dbt = ["dbt-athena-community", "dbt-bigquery", "dbt-core", "dbt-databricks", "dbt-duckdb", "dbt-redshift", "dbt-snowflake"] deltalake = ["deltalake", "pyarrow"] dremio = ["pyarrow"] duckdb = ["duckdb"] @@ -9323,4 +9282,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "d9034fc091a6e823373e742530d67b9c075d329afd2fee3bad7467716d2b2b9a" +content-hash = "9644e603fdf7b7ca6d177247950370b86ba1c84849deb7cfd83510086cb2e193" diff --git a/pyproject.toml b/pyproject.toml index 9086acea9b..abc87edaee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,12 +64,6 @@ duckdb = {version = ">=0.6.1,<0.11", optional = true} # {version = ">=0.6.1,<0.10.0", python = ">=3.8,<3.12", optional = true}, # {version = ">=0.10.0,<0.11.0", python = ">=3.12", optional = true} # ] -dbt-core = {version = ">=1.2.0", optional = true} -dbt-redshift = {version = ">=1.2.0", optional = true} -dbt-bigquery = {version = ">=1.2.0", optional = true} -dbt-duckdb = {version = ">=1.2.0", optional = true} -dbt-snowflake = {version = ">=1.2.0", optional = true} -dbt-athena-community = {version = ">=1.2.0", optional = true} s3fs = {version = ">=2022.4.0", optional = true} gcsfs = {version = ">=2022.4.0", optional = true} botocore = {version = ">=1.28", optional = true} @@ -81,14 +75,12 @@ weaviate-client = {version = ">=3.22", optional = true} adlfs = {version = ">=2022.4.0", optional = true} pyodbc = {version = "^4.0.39", optional = true} qdrant-client = {version = "^1.6.4", optional = true, extras = ["fastembed"]} -databricks-sql-connector = {version = ">=2.9.3,<3.0.0", optional = true} -dbt-databricks = {version = ">=1.7.3", optional = true} +databricks-sql-connector = {version = ">=3", optional = true} clickhouse-driver = { version = ">=0.2.7", optional = true } clickhouse-connect = { version = ">=0.7.7", optional = true } deltalake = { version = ">=0.17.4", optional = true } [tool.poetry.extras] -dbt = ["dbt-core", "dbt-redshift", "dbt-bigquery", "dbt-duckdb", "dbt-snowflake", "dbt-athena-community", "dbt-databricks"] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] # bigquery is alias on gcp extras bigquery = ["grpcio", "google-cloud-bigquery", "pyarrow", "db-dtypes", "gcsfs"] @@ -197,6 +189,19 @@ sentry-sdk = "^1.5.6" [tool.poetry.group.docs] optional = true + +[tool.poetry.group.dbt] +optional = true +[tool.poetry.group.dbt.dependencies] +dbt-core = ">=1.5.0" +dbt-redshift = ">=1.5.0" +dbt-bigquery = ">=1.5.0" +dbt-duckdb = ">=1.5.0" +dbt-snowflake = ">=1.5.0" +dbt-athena-community = ">=1.5.0" +dbt-sqlserver = ">=1.5.0" +# dbt-databricks = {version = ">=1.7.3", optional = true} + [tool.poetry.group.docs.dependencies] SQLAlchemy = ">=1.4.0" pymysql = "^1.1.0" @@ -208,7 +213,7 @@ dbt-duckdb = ">=1.2.0" pymongo = ">=4.3.3" pandas = ">2" alive-progress = ">=3.0.1" -pyarrow = ">=16.0.0" +pyarrow = ">=14.0.0" psycopg2-binary = ">=2.9" 
[tool.black] # https://black.readthedocs.io/en/stable/usage_and_configuration/the_basics.html#configuration-via-a-file diff --git a/tests/common/configuration/test_configuration.py b/tests/common/configuration/test_configuration.py index dd687bee83..e7083956b3 100644 --- a/tests/common/configuration/test_configuration.py +++ b/tests/common/configuration/test_configuration.py @@ -765,7 +765,7 @@ def test_find_all_keys() -> None: def test_coercion_to_hint_types(environment: Any) -> None: - add_config_dict_to_env(COERCIONS) + add_config_dict_to_env(COERCIONS, destructure_dicts=False) C = CoercionTestConfiguration() resolve._resolve_config_fields( @@ -829,7 +829,7 @@ def test_values_serialization() -> None: def test_invalid_coercions(environment: Any) -> None: C = CoercionTestConfiguration() - add_config_dict_to_env(INVALID_COERCIONS) + add_config_dict_to_env(INVALID_COERCIONS, destructure_dicts=False) for key, value in INVALID_COERCIONS.items(): try: resolve._resolve_config_fields( @@ -851,8 +851,8 @@ def test_invalid_coercions(environment: Any) -> None: def test_excepted_coercions(environment: Any) -> None: C = CoercionTestConfiguration() - add_config_dict_to_env(COERCIONS) - add_config_dict_to_env(EXCEPTED_COERCIONS, overwrite_keys=True) + add_config_dict_to_env(COERCIONS, destructure_dicts=False) + add_config_dict_to_env(EXCEPTED_COERCIONS, overwrite_keys=True, destructure_dicts=False) resolve._resolve_config_fields( C, explicit_values=None, explicit_sections=(), embedded_sections=(), accept_partial=False ) @@ -1297,6 +1297,15 @@ def test_add_config_to_env(environment: Dict[str, str]) -> None: add_config_to_env(c.sectioned) assert environment == {"DLT_TEST__PASSWORD": "PASS"} + # dicts should be added as sections + environment.clear() + c_s = ConnectionStringCredentials( + "mssql://loader:@loader.database.windows.net/dlt_data?TrustServerCertificate=yes&Encrypt=yes&LongAsMax=yes" + ) + add_config_to_env(c_s, ("dlt",)) + assert environment["DLT__CREDENTIALS__QUERY__ENCRYPT"] == "yes" + assert environment["DLT__CREDENTIALS__QUERY__TRUSTSERVERCERTIFICATE"] == "yes" + def test_configuration_copy() -> None: c = resolve.resolve_configuration( diff --git a/tests/load/pipeline/test_dbt_helper.py b/tests/load/pipeline/test_dbt_helper.py index 6793414e3c..1dc225594f 100644 --- a/tests/load/pipeline/test_dbt_helper.py +++ b/tests/load/pipeline/test_dbt_helper.py @@ -23,7 +23,8 @@ def dbt_venv() -> Iterator[Venv]: # context manager will delete venv at the end # yield Venv.restore_current() - with create_venv(tempfile.mkdtemp(), list(ACTIVE_SQL_DESTINATIONS)) as venv: + # NOTE: we limit the max version of dbt to allow all dbt adapters to run. ie. 
sqlserver does not work on 1.8 + with create_venv(tempfile.mkdtemp(), list(ACTIVE_SQL_DESTINATIONS), dbt_version="<1.8") as venv: yield venv @@ -70,6 +71,12 @@ def test_run_jaffle_package( ids=lambda x: x.name, ) def test_run_chess_dbt(destination_config: DestinationTestConfiguration, dbt_venv: Venv) -> None: + if destination_config.destination == "mssql": + pytest.skip( + "mssql requires non standard SQL syntax and we do not have specialized dbt package" + " for it" + ) + from docs.examples.chess.chess import chess # provide chess url via environ @@ -123,6 +130,11 @@ def test_run_chess_dbt(destination_config: DestinationTestConfiguration, dbt_ven def test_run_chess_dbt_to_other_dataset( destination_config: DestinationTestConfiguration, dbt_venv: Venv ) -> None: + if destination_config.destination == "mssql": + pytest.skip( + "mssql requires non standard SQL syntax and we do not have specialized dbt package" + " for it" + ) from docs.examples.chess.chess import chess # provide chess url via environ diff --git a/tests/load/utils.py b/tests/load/utils.py index 5a999dc1b7..8048d9fe51 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -197,8 +197,7 @@ def destinations_configs( destination_configs += [ DestinationTestConfiguration(destination=destination) for destination in SQL_DESTINATIONS - if destination - not in ("athena", "mssql", "synapse", "databricks", "dremio", "clickhouse") + if destination not in ("athena", "synapse", "databricks", "dremio", "clickhouse") ] destination_configs += [ DestinationTestConfiguration(destination="duckdb", file_format="parquet") @@ -247,7 +246,7 @@ def destinations_configs( ) ] destination_configs += [ - DestinationTestConfiguration(destination="mssql", supports_dbt=False), + # DestinationTestConfiguration(destination="mssql", supports_dbt=False), DestinationTestConfiguration(destination="synapse", supports_dbt=False), ] From 3efc10b4c787b4a625b16e79827043c6233b4802 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Tue, 11 Jun 2024 16:33:40 +0200 Subject: [PATCH 20/61] bumps to pre-release 0.4.13a0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index abc87edaee..36ee683de9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "0.4.12" +version = "0.4.13a0" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] From 23c6365310d60ed86e16b25196ae4e41e71e8b3b Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Thu, 13 Jun 2024 00:16:14 +0200 Subject: [PATCH 21/61] Add a docstring to the `IncrementalTransform` class (#1459) --- dlt/extract/incremental/transform.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index d117b4f1d8..2fa2bb7163 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -34,6 +34,15 @@ class IncrementalTransform: + """A base class for handling extraction and stateful tracking + of incremental data from input data items. + + By default, the descendant classes are instantiated within the + `dlt.extract.incremental.Incremental` class. + + Subclasses must implement the `__call__` method which will be called + for each data item in the extracted data. 
+ """ def __init__( self, resource_name: str, @@ -100,6 +109,7 @@ def deduplication_disabled(self) -> bool: class JsonIncremental(IncrementalTransform): + """Extracts incremental data from JSON data items.""" def find_cursor_value(self, row: TDataItem) -> Any: """Finds value in row at cursor defined by self.cursor_path. From 58501ab1b679bf3ded7334608d7bb2ef631bfe2e Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Tue, 18 Jun 2024 00:21:19 +0400 Subject: [PATCH 22/61] fix: service principal auth support for synapse copy job (#1472) * add service principal auth support for synapse copy into job * remove unused mypy ignore statements * add blank lines * re-add mypy ignore statements * make synapse test conditional on active destinations --- dlt/destinations/impl/synapse/synapse.py | 40 +++++++++++++++------ dlt/extract/incremental/transform.py | 2 ++ tests/load/pipeline/test_synapse.py | 44 ++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 11 deletions(-) create mode 100644 tests/load/pipeline/test_synapse.py diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index f52b64b9d9..48171ace4c 100644 --- a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -1,11 +1,9 @@ import os -from typing import ClassVar, Sequence, List, Dict, Any, Optional, cast +from typing import ClassVar, Sequence, List, Dict, Any, Optional, cast, Union from copy import deepcopy from textwrap import dedent from urllib.parse import urlparse, urlunparse -from dlt import current - from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( SupportsStagingDestination, @@ -16,13 +14,15 @@ from dlt.common.schema.utils import ( table_schema_has_type, get_inherited_table_hint, - is_complete_column, ) -from dlt.common.configuration.specs import AzureCredentialsWithoutDefaults +from dlt.common.configuration.exceptions import ConfigurationException +from dlt.common.configuration.specs import ( + AzureCredentialsWithoutDefaults, + AzureServicePrincipalCredentialsWithoutDefaults, +) from dlt.destinations.job_impl import NewReferenceJob -from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlJobParams from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.job_client_impl import SqlJobClientBase, LoadJob, CopyRemoteFileLoadJob from dlt.destinations.exceptions import LoadJobTerminalException @@ -163,7 +163,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> table, file_path, self.sql_client, - cast(AzureCredentialsWithoutDefaults, self.config.staging_config.credentials), + self.config.staging_config.credentials, # type: ignore[arg-type] self.config.staging_use_msi, ) return job @@ -175,7 +175,9 @@ def __init__( table: TTableSchema, file_path: str, sql_client: SqlClientBase[Any], - staging_credentials: Optional[AzureCredentialsWithoutDefaults] = None, + staging_credentials: Optional[ + Union[AzureCredentialsWithoutDefaults, AzureServicePrincipalCredentialsWithoutDefaults] + ] = None, staging_use_msi: bool = False, ) -> None: self.staging_use_msi = staging_use_msi @@ -204,7 +206,10 @@ def execute(self, table: TTableSchema, bucket_path: str) -> None: staging_credentials = self._staging_credentials assert staging_credentials is not None - assert isinstance(staging_credentials, AzureCredentialsWithoutDefaults) + assert isinstance( + staging_credentials, + 
(AzureCredentialsWithoutDefaults, AzureServicePrincipalCredentialsWithoutDefaults), + ) azure_storage_account_name = staging_credentials.azure_storage_account_name https_path = self._get_https_path(bucket_path, azure_storage_account_name) table_name = table["name"] @@ -212,8 +217,21 @@ def execute(self, table: TTableSchema, bucket_path: str) -> None: if self.staging_use_msi: credential = "IDENTITY = 'Managed Identity'" else: - sas_token = staging_credentials.azure_storage_sas_token - credential = f"IDENTITY = 'Shared Access Signature', SECRET = '{sas_token}'" + # re-use staging credentials for copy into Synapse + if isinstance(staging_credentials, AzureCredentialsWithoutDefaults): + sas_token = staging_credentials.azure_storage_sas_token + credential = f"IDENTITY = 'Shared Access Signature', SECRET = '{sas_token}'" + elif isinstance(staging_credentials, AzureServicePrincipalCredentialsWithoutDefaults): + tenant_id = staging_credentials.azure_tenant_id + endpoint = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token" + identity = f"{staging_credentials.azure_client_id}@{endpoint}" + secret = staging_credentials.azure_client_secret + credential = f"IDENTITY = '{identity}', SECRET = '{secret}'" + else: + raise ConfigurationException( + f"Credentials of type `{type(staging_credentials)}` not supported" + " when loading data from staging into Synapse using `COPY INTO`." + ) # Copy data from staging file into Synapse table. with self._sql_client.begin_transaction(): diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index 2fa2bb7163..8b4cae4090 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -43,6 +43,7 @@ class IncrementalTransform: Subclasses must implement the `__call__` method which will be called for each data item in the extracted data. """ + def __init__( self, resource_name: str, @@ -110,6 +111,7 @@ def deduplication_disabled(self) -> bool: class JsonIncremental(IncrementalTransform): """Extracts incremental data from JSON data items.""" + def find_cursor_value(self, row: TDataItem) -> Any: """Finds value in row at cursor defined by self.cursor_path. 
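
To make the credential branching added to the Synapse `COPY INTO` job in this patch easier to follow, here is a condensed sketch of the CREDENTIAL clause selection for the three supported modes (managed identity, SAS token, service principal). The `StagingAuth` dataclass is a hypothetical stand-in for the dlt credential classes; only the branching and the resulting SQL fragments mirror the change above.

from dataclasses import dataclass
from typing import Optional


@dataclass
class StagingAuth:
    # hypothetical stand-in for AzureCredentialsWithoutDefaults / AzureServicePrincipalCredentialsWithoutDefaults
    use_msi: bool = False
    sas_token: Optional[str] = None
    tenant_id: Optional[str] = None
    client_id: Optional[str] = None
    client_secret: Optional[str] = None


def build_credential_clause(auth: StagingAuth) -> str:
    """Builds the CREDENTIAL fragment of a Synapse COPY INTO statement."""
    if auth.use_msi:
        # managed identity: no secret is embedded in the SQL statement
        return "IDENTITY = 'Managed Identity'"
    if auth.sas_token is not None:
        # re-use the SAS token configured for the filesystem staging destination
        return f"IDENTITY = 'Shared Access Signature', SECRET = '{auth.sas_token}'"
    if auth.tenant_id and auth.client_id and auth.client_secret:
        # service principal: identity is "<client_id>@<oauth2 token endpoint>"
        endpoint = f"https://login.microsoftonline.com/{auth.tenant_id}/oauth2/token"
        return f"IDENTITY = '{auth.client_id}@{endpoint}', SECRET = '{auth.client_secret}'"
    raise ValueError("staging credentials not supported for COPY INTO")


print(build_credential_clause(StagingAuth(use_msi=True)))

The new test added below exercises the same three modes end to end by passing the staging credentials explicitly to the filesystem staging destination.
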
diff --git a/tests/load/pipeline/test_synapse.py b/tests/load/pipeline/test_synapse.py new file mode 100644 index 0000000000..dc1e6f9d7b --- /dev/null +++ b/tests/load/pipeline/test_synapse.py @@ -0,0 +1,44 @@ +from typing import Dict, Any, Union + +import pytest + +import dlt +from dlt.destinations import filesystem, synapse +from dlt.common.configuration.specs.azure_credentials import ( + AzureCredentialsWithoutDefaults, + AzureServicePrincipalCredentialsWithoutDefaults, +) + +from tests.utils import skip_if_not_active +from tests.pipeline.utils import assert_load_info +from tests.load.utils import AZ_BUCKET + + +skip_if_not_active("synapse") + + +@pytest.mark.parametrize("credentials_type", ("sas", "service_principal", "managed_identity")) +def test_copy_file_load_job_credentials(credentials_type: str) -> None: + staging_creds: Union[ + AzureCredentialsWithoutDefaults, AzureServicePrincipalCredentialsWithoutDefaults + ] + if credentials_type == "service_principal": + staging_creds = AzureServicePrincipalCredentialsWithoutDefaults( + **dlt.secrets.get("destination.fsazureprincipal.credentials") + ) + else: + FS_CREDS: Dict[str, Any] = dlt.secrets.get("destination.filesystem.credentials") + staging_creds = AzureCredentialsWithoutDefaults( + azure_storage_account_name=FS_CREDS["azure_storage_account_name"], + azure_storage_account_key=FS_CREDS["azure_storage_account_key"], + ) + + pipeline = dlt.pipeline( + staging=filesystem(bucket_url=AZ_BUCKET, credentials=staging_creds), + destination=synapse( + staging_use_msi=(True if credentials_type == "managed_identity" else False) + ), + ) + + info = pipeline.run([{"foo": "bar"}], table_name="abstract") + assert_load_info(info) From 1959942c4313ff28d2d077bb179094dd2bb531c6 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Tue, 18 Jun 2024 13:54:47 +0200 Subject: [PATCH 23/61] Loader parallelism strategies (#1457) * add support for parallelism strategies and max worker overriding from the destination * add direct support for custom destination * add tests * clean up print statement * make concurrency test more lenient * review fixes: * remove underscore from settingsvalue * simplify job selection function * let config always override destination settings * add info to docs * sort jobs before grouping --- dlt/common/destination/capabilities.py | 12 +- dlt/destinations/decorators.py | 8 +- dlt/destinations/impl/destination/__init__.py | 5 + dlt/destinations/impl/destination/factory.py | 2 + dlt/load/configuration.py | 10 +- dlt/load/load.py | 11 +- dlt/load/utils.py | 44 ++++++- .../dlt-ecosystem/destinations/destination.md | 6 +- tests/load/test_parallelism.py | 115 ++++++++++++++++++ tests/load/test_parallelism_util.py | 93 ++++++++++++++ 10 files changed, 297 insertions(+), 9 deletions(-) create mode 100644 tests/load/test_parallelism.py create mode 100644 tests/load/test_parallelism_util.py diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index e5ceb859f1..d8361d7140 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -30,6 +30,8 @@ # insert_values - insert SQL statements # sql - any sql statement TLoaderFileFormat = Literal["jsonl", "typed-jsonl", "insert_values", "parquet", "csv"] +TLoaderParallelismStrategy = Literal["parallel", "table-sequential", "sequential"] + ALL_SUPPORTED_FILE_FORMATS: Set[TLoaderFileFormat] = set(get_args(TLoaderFileFormat)) @@ -81,13 +83,19 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): 
insert_values_writer_type: str = "default" supports_multiple_statements: bool = True supports_clone_table: bool = False - """Destination supports CREATE TABLE ... CLONE ... statements""" - max_table_nesting: Optional[int] = None # destination can overwrite max table nesting + + max_table_nesting: Optional[int] = None + """Allows a destination to overwrite max_table_nesting from source""" # do not allow to create default value, destination caps must be always explicitly inserted into container can_create_default: ClassVar[bool] = False + max_parallel_load_jobs: Optional[int] = None + """The destination can set the maximum amount of parallel load jobs being executed""" + loader_parallelism_strategy: Optional[TLoaderParallelismStrategy] = None + """The destination can override the parallelism strategy""" + @staticmethod def generic_capabilities( preferred_loader_file_format: TLoaderFileFormat = None, diff --git a/dlt/destinations/decorators.py b/dlt/destinations/decorators.py index 8e0b5d5ee8..c202a37e28 100644 --- a/dlt/destinations/decorators.py +++ b/dlt/destinations/decorators.py @@ -10,6 +10,7 @@ from dlt.common.destination import TLoaderFileFormat from dlt.common.typing import TDataItems from dlt.common.schema import TTableSchema +from dlt.common.destination.capabilities import TLoaderParallelismStrategy from dlt.destinations.impl.destination.factory import destination as _destination from dlt.destinations.impl.destination.configuration import ( @@ -28,6 +29,8 @@ def destination( skip_dlt_columns_and_tables: bool = True, max_table_nesting: int = 0, spec: Type[CustomDestinationClientConfiguration] = None, + max_parallel_load_jobs: Optional[int] = None, + loader_parallelism_strategy: Optional[TLoaderParallelismStrategy] = None, ) -> Callable[ [Callable[Concatenate[Union[TDataItems, str], TTableSchema, TDestinationCallableParams], Any]], Callable[TDestinationCallableParams, _destination], @@ -55,7 +58,8 @@ def destination( max_nesting_level: defines how deep the normalizer will go to normalize complex fields on your data to create subtables. This overwrites any settings on your source and is set to zero to not create any nested tables by default. skip_dlt_columns_and_tables: defines whether internal tables and columns will be fed into the custom destination function. This is set to True by default. spec: defines a configuration spec that will be used to inject arguments into the decorated functions.
Argument not in spec will not be injected - + max_parallel_load_jobs: how many load jobs at most will be running during the load + loader_parallelism_strategy: Can be "sequential" which equals max_parallel_load_jobs=1, "table-sequential" where each table will have at most one loadjob at any given time and "parallel" Returns: A callable that can be used to create a dlt custom destination instance """ @@ -83,6 +87,8 @@ def wrapper( naming_convention=naming_convention, skip_dlt_columns_and_tables=skip_dlt_columns_and_tables, max_table_nesting=max_table_nesting, + max_parallel_load_jobs=max_parallel_load_jobs, + loader_parallelism_strategy=loader_parallelism_strategy, **kwargs, # type: ignore ) diff --git a/dlt/destinations/impl/destination/__init__.py b/dlt/destinations/impl/destination/__init__.py index f985119f26..5b076df4c6 100644 --- a/dlt/destinations/impl/destination/__init__.py +++ b/dlt/destinations/impl/destination/__init__.py @@ -1,11 +1,14 @@ from typing import Optional from dlt.common.destination import DestinationCapabilitiesContext, TLoaderFileFormat +from dlt.common.destination.capabilities import TLoaderParallelismStrategy def capabilities( preferred_loader_file_format: TLoaderFileFormat = "typed-jsonl", naming_convention: str = "direct", max_table_nesting: Optional[int] = 0, + max_parallel_load_jobs: Optional[int] = 0, + loader_parallelism_strategy: Optional[TLoaderParallelismStrategy] = None, ) -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext.generic_capabilities(preferred_loader_file_format) caps.supported_loader_file_formats = ["typed-jsonl", "parquet"] @@ -13,4 +16,6 @@ def capabilities( caps.supports_transactions = False caps.naming_convention = naming_convention caps.max_table_nesting = max_table_nesting + caps.max_parallel_load_jobs = max_parallel_load_jobs + caps.loader_parallelism_strategy = loader_parallelism_strategy return caps diff --git a/dlt/destinations/impl/destination/factory.py b/dlt/destinations/impl/destination/factory.py index 3ae6f2e876..b3127ab99b 100644 --- a/dlt/destinations/impl/destination/factory.py +++ b/dlt/destinations/impl/destination/factory.py @@ -41,6 +41,8 @@ def capabilities(self) -> DestinationCapabilitiesContext: ), naming_convention=self.config_params.get("naming_convention", "direct"), max_table_nesting=self.config_params.get("max_table_nesting", None), + max_parallel_load_jobs=self.config_params.get("max_parallel_load_jobs", None), + loader_parallelism_strategy=self.config_params.get("loader_parallelism_strategy", None), ) @property diff --git a/dlt/load/configuration.py b/dlt/load/configuration.py index b3fc2fbcd4..8abc679ea2 100644 --- a/dlt/load/configuration.py +++ b/dlt/load/configuration.py @@ -1,14 +1,18 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, Optional from dlt.common.configuration import configspec from dlt.common.storages import LoadStorageConfiguration from dlt.common.runners.configuration import PoolRunnerConfiguration, TPoolType +TLoaderParallelismStrategy = Literal["parallel", "table-sequential", "sequential"] + @configspec class LoaderConfiguration(PoolRunnerConfiguration): workers: int = 20 """how many parallel loads can be executed""" + parallelism_strategy: Optional[TLoaderParallelismStrategy] = None + """Which parallelism strategy to use at load time""" pool_type: TPoolType = "thread" # mostly i/o (upload) so may be thread pool raise_on_failed_jobs: bool = False """when True, raises on terminally failed jobs immediately""" @@ -20,4 +24,6 @@ class 
LoaderConfiguration(PoolRunnerConfiguration): truncate_staging_dataset: bool = False def on_resolved(self) -> None: - self.pool_type = "none" if self.workers == 1 else "thread" + self.pool_type = ( + "none" if (self.workers == 1 or self.parallelism_strategy == "sequential") else "thread" + ) diff --git a/dlt/load/load.py b/dlt/load/load.py index 8c7eb431e8..abbeee5ddf 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -53,7 +53,12 @@ LoadClientUnsupportedWriteDisposition, LoadClientUnsupportedFileFormats, ) -from dlt.load.utils import _extend_tables_with_table_chain, get_completed_table_chain, init_client +from dlt.load.utils import ( + _extend_tables_with_table_chain, + get_completed_table_chain, + init_client, + filter_new_jobs, +) class Load(Runnable[Executor], WithStepInfo[LoadMetrics, LoadInfo]): @@ -191,7 +196,9 @@ def w_spool_job( def spool_new_jobs(self, load_id: str, schema: Schema) -> Tuple[int, List[LoadJob]]: # use thread based pool as jobs processing is mostly I/O and we do not want to pickle jobs - load_files = self.load_storage.list_new_jobs(load_id)[: self.config.workers] + load_files = filter_new_jobs( + self.load_storage.list_new_jobs(load_id), self.capabilities, self.config + ) file_count = len(load_files) if file_count == 0: logger.info(f"No new jobs found in {load_id}") diff --git a/dlt/load/utils.py b/dlt/load/utils.py index 5126cbd11e..4e5099855b 100644 --- a/dlt/load/utils.py +++ b/dlt/load/utils.py @@ -1,4 +1,5 @@ -from typing import List, Set, Iterable, Callable, Optional +from typing import List, Set, Iterable, Callable, Optional, Sequence +from itertools import groupby from dlt.common import logger from dlt.common.storages.load_package import LoadJobInfo, PackageStorage @@ -15,6 +16,8 @@ JobClientBase, WithStagingDataset, ) +from dlt.load.configuration import LoaderConfiguration +from dlt.common.destination import DestinationCapabilitiesContext def get_completed_table_chain( @@ -216,3 +219,42 @@ def _extend_tables_with_table_chain( continue result.add(chain_table_name) return result + + +def filter_new_jobs( + file_names: Sequence[str], + capabilities: DestinationCapabilitiesContext, + config: LoaderConfiguration, +) -> Sequence[str]: + """Filters the list of new jobs to adhere to max_workers and parallellism strategy""" + """NOTE: in the current setup we only filter based on settings for the final destination""" + """Support for differentiating staging destination jobs might come in the future if we need it""" + + # nothing to do + if not file_names: + return file_names + + # config can overwrite destination settings, if nothing is set, code below defaults to parallel + parallelism_strategy = config.parallelism_strategy or capabilities.loader_parallelism_strategy + + # find real max workers value + max_workers = 1 if parallelism_strategy == "sequential" else config.workers + if mp := capabilities.max_parallel_load_jobs: + max_workers = min(max_workers, mp) + + # regular sequential works on all jobs + eligible_jobs = file_names + + # we must ensure there only is one job per table + if parallelism_strategy == "table-sequential": + eligible_jobs = sorted( + eligible_jobs, key=lambda j: ParsedLoadJobFileName.parse(j).table_name + ) + eligible_jobs = [ + next(table_jobs) + for _, table_jobs in groupby( + eligible_jobs, lambda j: ParsedLoadJobFileName.parse(j).table_name + ) + ] + + return eligible_jobs[:max_workers] diff --git a/docs/website/docs/dlt-ecosystem/destinations/destination.md b/docs/website/docs/dlt-ecosystem/destinations/destination.md index 
c9a0bff022..6ffc13ad74 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/destination.md +++ b/docs/website/docs/dlt-ecosystem/destinations/destination.md @@ -55,7 +55,9 @@ The full signature of the destination decorator plus its function is the followi name="my_custom_destination", naming_convention="direct", max_table_nesting=0, - skip_dlt_columns_and_tables=True + skip_dlt_columns_and_tables=True, + max_parallel_load_jobs=5, + loader_parallelism_strategy="table-sequential", ) def my_destination(items: TDataItems, table: TTableSchema) -> None: ... @@ -68,6 +70,8 @@ def my_destination(items: TDataItems, table: TTableSchema) -> None: * The `naming_convention` parameter on the destination decorator defines the name of the destination that gets created by the destination decorator. This controls how table and column names are normalized. The default is `direct`, which will keep all names the same. * The `max_nesting_level` parameter on the destination decorator defines how deep the normalizer will go to normalize complex fields on your data to create subtables. This overwrites any settings on your `source` and is set to zero to not create any nested tables by default. * The `skip_dlt_columns_and_tables` parameter on the destination decorator defines whether internal tables and columns will be fed into the custom destination function. This is set to `True` by default. +* The `max_parallel_load_jobs` parameter defines how many load jobs will run in parallel in threads. If you have a destination that only allows five connections at a time, you can set this value to 5, for example. +* The `loader_parallelism_strategy` parameter controls how load jobs are parallelized. With `parallel`, the default, jobs are parallelized no matter which table is being loaded to. `table-sequential` still parallelizes loading but only ever runs one load job per table at a time, and `sequential` runs all load jobs sequentially on the main thread. :::note Settings above make sure that shape of the data you receive in the destination function is as close as possible to what you see in the data source.
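
The docs above describe the strategies from the user's perspective; the selection itself happens in `filter_new_jobs` (see the `dlt/load/utils.py` hunk earlier in this patch). Below is a minimal, self-contained sketch of that behavior, assuming job file names start with the table name followed by a dot, as produced by the load package storage; it is an illustration, not the actual implementation.

from itertools import groupby
from typing import List, Optional


def pick_jobs(
    file_names: List[str], max_workers: int = 20, strategy: Optional[str] = None
) -> List[str]:
    # "sequential" degrades to a single worker
    if strategy == "sequential":
        max_workers = 1
    eligible = list(file_names)
    if strategy == "table-sequential":
        # keep at most one pending job per table
        def table_of(name: str) -> str:
            return name.split(".", 1)[0]

        eligible = sorted(eligible, key=table_of)
        eligible = [next(jobs) for _, jobs in groupby(eligible, key=table_of)]
    return eligible[:max_workers]


jobs = ["t1.a1b2.0.jsonl", "t1.c3d4.0.jsonl", "t2.e5f6.0.jsonl"]
print(pick_jobs(jobs, strategy="table-sequential"))  # ['t1.a1b2.0.jsonl', 't2.e5f6.0.jsonl']

In the real loader the destination capabilities can further cap the worker count via `max_parallel_load_jobs`, and a strategy set in the loader config always overrides the strategy preferred by the destination.
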
diff --git a/tests/load/test_parallelism.py b/tests/load/test_parallelism.py new file mode 100644 index 0000000000..a1a09a4d6b --- /dev/null +++ b/tests/load/test_parallelism.py @@ -0,0 +1,115 @@ +""" +Actual parallelism test with the help of custom destination +""" +import os +import dlt +import time +from typing import Dict, Tuple + +from dlt.common.typing import TDataItems +from dlt.common.schema import TTableSchema +from dlt.common.destination.capabilities import TLoaderParallelismStrategy + + +def run_pipeline( + items_per_table: int, + max_parallel_load_jobs: int = None, + loader_parallelism_strategy: TLoaderParallelismStrategy = None, +) -> Tuple[int, Dict[str, int]]: + """here we create a pipeline and count how many jobs run in parallel overall and per table depending on the settings""" + + # create one job per item + os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "1" + + current_executing: int = 0 + max_current_executing: int = 0 + + current_executing_per_table: Dict[str, int] = {} + max_current_executing_per_table: Dict[str, int] = {} + + @dlt.destination( + max_parallel_load_jobs=max_parallel_load_jobs, + loader_parallelism_strategy=loader_parallelism_strategy, + ) + def test_sink(items: TDataItems, table: TTableSchema) -> None: + nonlocal current_executing, max_current_executing, current_executing_per_table, max_current_executing_per_table + table_name = table["name"] + # remember the amount of concurrent executions + current_executing += 1 + max_current_executing = max(max_current_executing, current_executing) + # table + current_executing_per_table.setdefault(table_name, 0) + max_current_executing_per_table.setdefault(table_name, 0) + current_executing_per_table[table_name] += 1 + max_current_executing_per_table[table_name] = max( + max_current_executing_per_table[table_name], current_executing_per_table[table_name] + ) + # NOTE: this approach might make the test flaky again, let's see + time.sleep(0.5) + current_executing -= 1 + current_executing_per_table[table_name] -= 1 + + def t() -> TDataItems: + nonlocal items_per_table + for i in range(items_per_table): + yield {"num": i} + + # we load n items for 3 tables in one run + p = dlt.pipeline("sink_test", destination=test_sink, full_refresh=True) + p.run( + [ + dlt.resource(table_name="t1")(t), + dlt.resource(table_name="t2")(t), + dlt.resource(table_name="t3")(t), + ] + ) + + return max_current_executing, max_current_executing_per_table + + +def test_max_concurrent() -> None: + # default is 20, so result is lower than that + max_concurrent, _ = run_pipeline(10) + assert max_concurrent <= 20 and max_concurrent >= 18 + + # lower it + max_concurrent, _ = run_pipeline(5, max_parallel_load_jobs=5) + assert max_concurrent <= 5 and max_concurrent >= 3 + + # sequential strategy will make it go to 1 + max_concurrent, _ = run_pipeline( + 2, max_parallel_load_jobs=5, loader_parallelism_strategy="sequential" + ) + assert max_concurrent == 1 + + +def test_loading_strategy() -> None: + max_concurrent, max_concurrent_per_table = run_pipeline( + 10, max_parallel_load_jobs=20, loader_parallelism_strategy="parallel" + ) + # this includes multiple jobs per table being run + assert max_concurrent <= 20 and max_concurrent >= 18 + assert max_concurrent_per_table["t1"] > 2 + + # this strategy only allows one job per table max + max_concurrent, max_concurrent_per_table = run_pipeline( + 3, loader_parallelism_strategy="table-sequential" + ) + # we still have concurrent jobs but only one per table max + assert max_concurrent <= 3 and max_concurrent >= 
2 + assert max_concurrent_per_table == { + "t1": 1, + "t2": 1, + "t3": 1, + } + + # sequential strategy will make it go to 1 + max_concurrent, _ = run_pipeline( + 2, max_parallel_load_jobs=5, loader_parallelism_strategy="sequential" + ) + assert max_concurrent == 1 + assert max_concurrent_per_table == { + "t1": 1, + "t2": 1, + "t3": 1, + } diff --git a/tests/load/test_parallelism_util.py b/tests/load/test_parallelism_util.py new file mode 100644 index 0000000000..b8f43d0743 --- /dev/null +++ b/tests/load/test_parallelism_util.py @@ -0,0 +1,93 @@ +""" +Tests to test the parallelism settings on the loader +NOTE: there are tests in custom destination to check parallelism settings are applied +""" + +from typing import Tuple + +from dlt.load.utils import filter_new_jobs +from dlt.load.configuration import LoaderConfiguration +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.utils import uniq_id +from dlt.common.storages.load_storage import ParsedLoadJobFileName + + +def create_job_name(table: str, index: int) -> str: + uid = uniq_id() + return f"{table}.{uid}.{index}.jsonl" + + +def get_caps_conf() -> Tuple[DestinationCapabilitiesContext, LoaderConfiguration]: + return DestinationCapabilitiesContext(), LoaderConfiguration() + + +def test_max_workers() -> None: + job_names = [create_job_name("t1", i) for i in range(100)] + caps, conf = get_caps_conf() + + # default is 20 + assert len(filter_new_jobs(job_names, caps, conf)) == 20 + + # we can change it + conf.workers = 35 + assert len(filter_new_jobs(job_names, caps, conf)) == 35 + + # destination may override this + caps.max_parallel_load_jobs = 15 + assert len(filter_new_jobs(job_names, caps, conf)) == 15 + + # lowest value will prevail + conf.workers = 5 + assert len(filter_new_jobs(job_names, caps, conf)) == 5 + + +def test_table_sequential_parallelism_strategy() -> None: + # we create 10 jobs for 8 different tables + job_names = [] + for y in range(8): + job_names += [create_job_name(f"t{y}", i) for i in range(10)] + assert len(job_names) == 80 + assert len({ParsedLoadJobFileName.parse(j).table_name for j in job_names}) == 8 + caps, conf = get_caps_conf() + + # default is 20 + assert len(filter_new_jobs(job_names, caps, conf)) == 20 + + # table sequential will give us 8, one for each table + conf.parallelism_strategy = "table-sequential" + filtered = filter_new_jobs(job_names, caps, conf) + assert len(filtered) == 8 + assert len({ParsedLoadJobFileName.parse(j).table_name for j in job_names}) == 8 + + # max workers also are still applied + conf.workers = 3 + assert len(filter_new_jobs(job_names, caps, conf)) == 3 + + +def test_strategy_preference() -> None: + # we create 10 jobs for 8 different tables + job_names = [] + for y in range(8): + job_names += [create_job_name(f"t{y}", i) for i in range(10)] + caps, conf = get_caps_conf() + + # nothing set will default to parallel + assert len(filter_new_jobs(job_names, caps, conf)) == 20 + + caps.loader_parallelism_strategy = "table-sequential" + assert len(filter_new_jobs(job_names, caps, conf)) == 8 + + caps.loader_parallelism_strategy = "sequential" + assert len(filter_new_jobs(job_names, caps, conf)) == 1 + + # config may override (will go back to default 20) + conf.parallelism_strategy = "parallel" + assert len(filter_new_jobs(job_names, caps, conf)) == 20 + + conf.parallelism_strategy = "table-sequential" + assert len(filter_new_jobs(job_names, caps, conf)) == 8 + + +def test_no_input() -> None: + caps, conf = get_caps_conf() + assert filter_new_jobs([], 
caps, conf) == [] From b135b8190c9d71e267bee465cfc77374a966007d Mon Sep 17 00:00:00 2001 From: David Scharf Date: Tue, 18 Jun 2024 17:40:38 +0200 Subject: [PATCH 24/61] Migrate to sentry sdk 2.0 (#1477) * update sentry dependency * migrate to sentry 2.0 * implement current scope span accessor --- dlt/common/runtime/sentry.py | 2 +- dlt/pipeline/track.py | 26 ++-- poetry.lock | 226 +++----------------------------- pyproject.toml | 3 +- tests/pipeline/test_pipeline.py | 4 +- 5 files changed, 32 insertions(+), 229 deletions(-) diff --git a/dlt/common/runtime/sentry.py b/dlt/common/runtime/sentry.py index 7ea45affc0..835a4d6446 100644 --- a/dlt/common/runtime/sentry.py +++ b/dlt/common/runtime/sentry.py @@ -28,7 +28,7 @@ def init_sentry(config: RunConfiguration) -> None: # https://docs.sentry.io/platforms/python/guides/logging/ sentry_sdk.init( config.sentry_dsn, - before_send=before_send, + before_send=before_send, # type: ignore traces_sample_rate=1.0, # disable tornado, boto3, sql alchemy etc. auto_enabling_integrations=False, diff --git a/dlt/pipeline/track.py b/dlt/pipeline/track.py index 2300eef275..f855de6dfd 100644 --- a/dlt/pipeline/track.py +++ b/dlt/pipeline/track.py @@ -1,6 +1,6 @@ """Implements SupportsTracking""" import contextlib -from typing import Any +from typing import Any, List import humanize from dlt.common import logger @@ -15,8 +15,8 @@ from dlt.pipeline.trace import PipelineTrace, PipelineStepTrace try: - from sentry_sdk import Hub - from sentry_sdk.tracing import Span + from sentry_sdk import Scope + from sentry_sdk.tracing import Span, Transaction def _add_sentry_tags(span: Span, pipeline: SupportsPipeline) -> None: span.set_tag("pipeline_name", pipeline.pipeline_name) @@ -61,11 +61,11 @@ def _get_step_elapsed(step: PipelineStepTrace) -> str: def on_start_trace(trace: PipelineTrace, step: TPipelineStep, pipeline: SupportsPipeline) -> None: if pipeline.runtime_config.sentry_dsn: - # https://getsentry.github.io/sentry-python/api.html#sentry_sdk.Hub.capture_event - # print(f"START SENTRY TX: {trace.transaction_id} SCOPE: {Hub.current.scope}") - transaction = Hub.current.start_transaction(name=step, op=step) - _add_sentry_tags(transaction, pipeline) - transaction.__enter__() + # print(f"START SENTRY TX: {trace.transaction_id} SCOPE: {Hub.current.scope}" + transaction = Scope.get_current_scope().start_transaction(name=step, op=step) + if isinstance(transaction, Transaction): + _add_sentry_tags(transaction, pipeline) + transaction.__enter__() def on_start_trace_step( @@ -73,9 +73,9 @@ def on_start_trace_step( ) -> None: if pipeline.runtime_config.sentry_dsn: # print(f"START SENTRY SPAN {trace.transaction_id}:{trace_step.span_id} SCOPE: {Hub.current.scope}") - span = Hub.current.scope.span.start_child(description=step, op=step).__enter__() - span.op = step + span = Scope.get_current_scope().start_span(description=step, op=step) _add_sentry_tags(span, pipeline) + span.__enter__() def on_end_trace_step( @@ -87,8 +87,7 @@ def on_end_trace_step( ) -> None: if pipeline.runtime_config.sentry_dsn: # print(f"---END SENTRY SPAN {trace.transaction_id}:{step.span_id}: {step} SCOPE: {Hub.current.scope}") - with contextlib.suppress(Exception): - Hub.current.scope.span.__exit__(None, None, None) + Scope.get_current_scope().span.__exit__(None, None, None) # disable automatic slack messaging until we can configure messages themselves # if step.step == "load": # if pipeline.runtime_config.slack_incoming_hook and step.step_exception is None: @@ -117,5 +116,4 @@ def on_end_trace_step( def 
on_end_trace(trace: PipelineTrace, pipeline: SupportsPipeline, send_state: bool) -> None: if pipeline.runtime_config.sentry_dsn: # print(f"---END SENTRY TX: {trace.transaction_id} SCOPE: {Hub.current.scope}") - with contextlib.suppress(Exception): - Hub.current.scope.span.__exit__(None, None, None) + Scope.get_current_scope().transaction.__exit__(None, None, None) diff --git a/poetry.lock b/poetry.lock index 31c9fd08ce..f6a6f98c1a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "about-time" @@ -3521,164 +3521,6 @@ files = [ {file = "google_re2-1.1-1-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c6c9f64b9724ec38da8e514f404ac64e9a6a5e8b1d7031c2dadd05c1f4c16fd"}, {file = "google_re2-1.1-1-cp39-cp39-win32.whl", hash = "sha256:d1b751b9ab9f8e2ab2a36d72b909281ce65f328c9115a1685acae1a2d1afd7a4"}, {file = "google_re2-1.1-1-cp39-cp39-win_amd64.whl", hash = "sha256:ac775c75cec7069351d201da4e0fb0cae4c1c5ebecd08fa34e1be89740c1d80b"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5eaefe4705b75ca5f78178a50104b689e9282f868e12f119b26b4cffc0c7ee6e"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:e35f2c8aabfaaa4ce6420b3cae86c0c29042b1b4f9937254347e9b985694a171"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:35fd189cbaaaa39c9a6a8a00164c8d9c709bacd0c231c694936879609beff516"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:60475d222cebd066c80414831c8a42aa2449aab252084102ee05440896586e6a"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:871cb85b9b0e1784c983b5c148156b3c5314cb29ca70432dff0d163c5c08d7e5"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:94f4e66e34bdb8de91ec6cdf20ba4fa9fea1dfdcfb77ff1f59700d01a0243664"}, - {file = "google_re2-1.1-2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1563577e2b720d267c4cffacc0f6a2b5c8480ea966ebdb1844fbea6602c7496f"}, - {file = "google_re2-1.1-2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:49b7964532a801b96062d78c0222d155873968f823a546a3dbe63d73f25bb56f"}, - {file = "google_re2-1.1-2-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2362fd70eb639a75fd0187d28b4ba7b20b3088833d8ad7ffd8693d0ba159e1c2"}, - {file = "google_re2-1.1-2-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:86b80719636a4e21391e20a9adf18173ee6ae2ec956726fe2ff587417b5e8ba6"}, - {file = "google_re2-1.1-2-cp310-cp310-win32.whl", hash = "sha256:5456fba09df951fe8d1714474ed1ecda102a68ddffab0113e6c117d2e64e6f2b"}, - {file = "google_re2-1.1-2-cp310-cp310-win_amd64.whl", hash = "sha256:2ac6936a3a60d8d9de9563e90227b3aea27068f597274ca192c999a12d8baa8f"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d5a87b436028ec9b0f02fe19d4cbc19ef30441085cdfcdf1cce8fbe5c4bd5e9a"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:fc0d4163de9ed2155a77e7a2d59d94c348a6bbab3cff88922fab9e0d3d24faec"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:48b12d953bc796736e7831d67b36892fb6419a4cc44cb16521fe291e594bfe23"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_12_0_x86_64.whl", hash = 
"sha256:62c780c927cff98c1538439f0ff616f48a9b2e8837c676f53170d8ae5b9e83cb"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:04b2aefd768aa4edeef8b273327806c9cb0b82e90ff52eacf5d11003ac7a0db2"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:9c90175992346519ee7546d9af9a64541c05b6b70346b0ddc54a48aa0d3b6554"}, - {file = "google_re2-1.1-2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22ad9ad9d125249d6386a2e80efb9de7af8260b703b6be7fa0ab069c1cf56ced"}, - {file = "google_re2-1.1-2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f70971f6ffe5254e476e71d449089917f50ebf9cf60f9cec80975ab1693777e2"}, - {file = "google_re2-1.1-2-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f267499529e64a4abed24c588f355ebe4700189d434d84a7367725f5a186e48d"}, - {file = "google_re2-1.1-2-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b632eff5e4cd44545a9c0e52f2e1becd55831e25f4dd4e0d7ec8ee6ca50858c1"}, - {file = "google_re2-1.1-2-cp311-cp311-win32.whl", hash = "sha256:a42c733036e8f242ee4e5f0e27153ad4ca44ced9e4ce82f3972938ddee528db0"}, - {file = "google_re2-1.1-2-cp311-cp311-win_amd64.whl", hash = "sha256:64f8eed4ca96905d99b5286b3d14b5ca4f6a025ff3c1351626a7df2f93ad1ddd"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5541efcca5b5faf7e0d882334a04fa479bad4e7433f94870f46272eec0672c4a"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:92309af35b6eb2d3b3dc57045cdd83a76370958ab3e0edd2cc4638f6d23f5b32"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:197cd9bcaba96d18c5bf84d0c32fca7a26c234ea83b1d3083366f4392cb99f78"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:1b896f171d29b541256cf26e10dccc9103ac1894683914ed88828ca6facf8dca"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:e022d3239b945014e916ca7120fee659b246ec26c301f9e0542f1a19b38a8744"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:2c73f8a9440873b68bee1198094377501065e85aaf6fcc0d2512c7589ffa06ca"}, - {file = "google_re2-1.1-2-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:901d86555bd7725506d651afaba7d71cd4abd13260aed6cfd7c641a45f76d4f6"}, - {file = "google_re2-1.1-2-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ce4710ff636701cfb56eb91c19b775d53b03749a23b7d2a5071bbbf4342a9067"}, - {file = "google_re2-1.1-2-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76a20e5ebdf5bc5d430530197e42a2eeb562f729d3a3fb51f39168283d676e66"}, - {file = "google_re2-1.1-2-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:77c9f4d4bb1c8de9d2642d3c4b8b615858ba764df025b3b4f1310266f8def269"}, - {file = "google_re2-1.1-2-cp38-cp38-win32.whl", hash = "sha256:94bd60785bf37ef130a1613738e3c39465a67eae3f3be44bb918540d39b68da3"}, - {file = "google_re2-1.1-2-cp38-cp38-win_amd64.whl", hash = "sha256:59efeb77c0dcdbe37794c61f29c5b1f34bc06e8ec309a111ccdd29d380644d70"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:221e38c27e1dd9ccb8e911e9c7aed6439f68ce81e7bb74001076830b0d6e931d"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:d9145879e6c2e1b814445300b31f88a675e1f06c57564670d95a1442e8370c27"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_12_0_arm64.whl", hash = 
"sha256:c8a12f0740e2a52826bdbf95569a4b0abdf413b4012fa71e94ad25dd4715c6e5"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:9c9998f71466f4db7bda752aa7c348b2881ff688e361108fe500caad1d8b9cb2"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:0c39f69b702005963a3d3bf78743e1733ad73efd7e6e8465d76e3009e4694ceb"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:6d0ce762dee8d6617d0b1788a9653e805e83a23046c441d0ea65f1e27bf84114"}, - {file = "google_re2-1.1-2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ecf3619d98c9b4a7844ab52552ad32597cdbc9a5bdbc7e3435391c653600d1e2"}, - {file = "google_re2-1.1-2-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9a1426a8cbd1fa004974574708d496005bd379310c4b1c7012be4bc75efde7a8"}, - {file = "google_re2-1.1-2-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a1a30626ba48b4070f3eab272d860ef1952e710b088792c4d68dddb155be6bfc"}, - {file = "google_re2-1.1-2-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1b9c1ffcfbc3095b6ff601ec2d2bf662988f6ea6763bc1c9d52bec55881f8fde"}, - {file = "google_re2-1.1-2-cp39-cp39-win32.whl", hash = "sha256:32ecf995a252c0548404c1065ba4b36f1e524f1f4a86b6367a1a6c3da3801e30"}, - {file = "google_re2-1.1-2-cp39-cp39-win_amd64.whl", hash = "sha256:e7865410f3b112a3609739283ec3f4f6f25aae827ff59c6bfdf806fd394d753e"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3b21f83f0a201009c56f06fcc7294a33555ede97130e8a91b3f4cae01aed1d73"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b38194b91354a38db1f86f25d09cdc6ac85d63aee4c67b43da3048ce637adf45"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e7da3da8d6b5a18d6c3b61b11cc5b66b8564eaedce99d2312b15b6487730fc76"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:aeca656fb10d8638f245331aabab59c9e7e051ca974b366dd79e6a9efb12e401"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:2069d6dc94f5fa14a159bf99cad2f11e9c0f8ec3b7f44a4dde9e59afe5d1c786"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:2319a39305a4931cb5251451f2582713418a19bef2af7adf9e2a7a0edd939b99"}, - {file = "google_re2-1.1-3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb98fc131699756c6d86246f670a5e1c1cc1ba85413c425ad344cb30479b246c"}, - {file = "google_re2-1.1-3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a6e038986d8ffe4e269f8532f03009f229d1f6018d4ac0dabc8aff876338f6e0"}, - {file = "google_re2-1.1-3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8618343ee658310e0f53bf586fab7409de43ce82bf8d9f7eb119536adc9783fd"}, - {file = "google_re2-1.1-3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d8140ca861cfe00602319cefe2c7b8737b379eb07fb328b51dc44584f47a2718"}, - {file = "google_re2-1.1-3-cp310-cp310-win32.whl", hash = "sha256:41f439c5c54e8a3a0a1fa2dbd1e809d3f643f862df7b16dd790f36a1238a272e"}, - {file = "google_re2-1.1-3-cp310-cp310-win_amd64.whl", hash = "sha256:fe20e97a33176d96d3e4b5b401de35182b9505823abea51425ec011f53ef5e56"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c39ff52b1765db039f690ee5b7b23919d8535aae94db7996079fbde0098c4d7"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_11_0_x86_64.whl", hash = 
"sha256:5420be674fd164041639ba4c825450f3d4bd635572acdde16b3dcd697f8aa3ef"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:ff53881cf1ce040f102a42d39db93c3f835f522337ae9c79839a842f26d97733"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:8d04600b0b53523118df2e413a71417c408f20dee640bf07dfab601c96a18a77"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:c4835d4849faa34a7fa1074098d81c420ed6c0707a3772482b02ce14f2a7c007"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:3309a9b81251d35fee15974d0ae0581a9a375266deeafdc3a3ac0d172a742357"}, - {file = "google_re2-1.1-3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e2b51cafee7e0bc72d0a4a454547bd8f257cde412ac9f1a2dc46a203b5e42cf4"}, - {file = "google_re2-1.1-3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:83f5f1cb52f832c2297d271ee8c56cf5e9053448162e5d2223d513f729bad908"}, - {file = "google_re2-1.1-3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:55865a1ace92be3f7953b2e2b38b901d8074a367aa491daee43260a53a7fc6f0"}, - {file = "google_re2-1.1-3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cec2167dd142e583e98c783bd0d28b8cf5a9cdbe1f7407ba4163fe3ccb613cb9"}, - {file = "google_re2-1.1-3-cp311-cp311-win32.whl", hash = "sha256:a0bc1fe96849e4eb8b726d0bba493f5b989372243b32fe20729cace02e5a214d"}, - {file = "google_re2-1.1-3-cp311-cp311-win_amd64.whl", hash = "sha256:e6310a156db96fc5957cb007dd2feb18476898654530683897469447df73a7cd"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8e63cd10ea006088b320e8c5d308da1f6c87aa95138a71c60dd7ca1c8e91927e"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:12b566830a334178733a85e416b1e0507dbc0ceb322827616fe51ef56c5154f1"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:442e18c9d46b225c1496919c16eafe8f8d9bb4091b00b4d3440da03c55bbf4ed"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:c54c00263a9c39b2dacd93e9636319af51e3cf885c080b9680a9631708326460"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:15a3caeeb327bc22e0c9f95eb76890fec8874cacccd2b01ff5c080ab4819bbec"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:59ec0d2cced77f715d41f6eafd901f6b15c11e28ba25fe0effdc1de554d78e75"}, - {file = "google_re2-1.1-3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:185bf0e3441aed3840590f8e42f916e2920d235eb14df2cbc2049526803d3e71"}, - {file = "google_re2-1.1-3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:586d3f2014eea5be14d8de53374d9b79fa99689160e00efa64b5fe93af326087"}, - {file = "google_re2-1.1-3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cc2575082de4ffd234d9607f3ae67ca22b15a1a88793240e2045f3b3a36a5795"}, - {file = "google_re2-1.1-3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:59c5ad438eddb3630def394456091284d7bbc5b89351987f94f3792d296d1f96"}, - {file = "google_re2-1.1-3-cp312-cp312-win32.whl", hash = "sha256:5b9878c53f2bf16f75bf71d4ddd57f6611351408d5821040e91c53ebdf82c373"}, - {file = "google_re2-1.1-3-cp312-cp312-win_amd64.whl", hash = "sha256:4fdecfeb213110d0a85bad335a8e7cdb59fea7de81a4fe659233f487171980f9"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_11_0_arm64.whl", hash 
= "sha256:2dd87bacab32b709c28d0145fe75a956b6a39e28f0726d867375dba5721c76c1"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:55d24c61fe35dddc1bb484593a57c9f60f9e66d7f31f091ef9608ed0b6dde79f"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a0cf1180d908622df648c26b0cd09281f92129805ccc56a39227fdbfeab95cb4"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:09586f07f3f88d432265c75976da1c619ab7192cd7ebdf53f4ae0776c19e4b56"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:539f1b053402203576e919a06749198da4ae415931ee28948a1898131ae932ce"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:abf0bcb5365b0e27a5a23f3da403dffdbbac2c0e3a3f1535a8b10cc121b5d5fb"}, - {file = "google_re2-1.1-3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:19c83e5bbed7958213eeac3aa71c506525ce54faf03e07d0b96cd0a764890511"}, - {file = "google_re2-1.1-3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3348e77330ff672dc44ec01894fa5d93c409a532b6d688feac55e714e9059920"}, - {file = "google_re2-1.1-3-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:06b63edb57c5ce5a13eabfd71155e346b9477dc8906dec7c580d4f70c16a7e0d"}, - {file = "google_re2-1.1-3-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:12fe57ba2914092b83338d61d8def9ebd5a2bd0fd8679eceb5d4c2748105d5c0"}, - {file = "google_re2-1.1-3-cp38-cp38-win32.whl", hash = "sha256:80796e08d24e606e675019fe8de4eb5c94bb765be13c384f2695247d54a6df75"}, - {file = "google_re2-1.1-3-cp38-cp38-win_amd64.whl", hash = "sha256:3c2257dedfe7cc5deb6791e563af9e071a9d414dad89e37ac7ad22f91be171a9"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:43a0cd77c87c894f28969ac622f94b2e6d1571261dfdd785026848a25cfdc9b9"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:1038990b77fd66f279bd66a0832b67435ea925e15bb59eafc7b60fdec812b616"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fb5dda6875d18dd45f0f24ebced6d1f7388867c8fb04a235d1deab7ea479ce38"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:bb1d164965c6d57a351b421d2f77c051403766a8b75aaa602324ee2451fff77f"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a072ebfa495051d07ffecbf6ce21eb84793568d5c3c678c00ed8ff6b8066ab31"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:4eb66c8398c8a510adc97978d944b3b29c91181237218841ea1a91dc39ec0e54"}, - {file = "google_re2-1.1-3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f7c8b57b1f559553248d1757b7fa5b2e0cc845666738d155dff1987c2618264e"}, - {file = "google_re2-1.1-3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9162f6aa4f25453c682eb176f21b8e2f40205be9f667e98a54b3e1ff10d6ee75"}, - {file = "google_re2-1.1-3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a2d65ddf67fd7bf94705626871d463057d3d9a3538d41022f95b9d8f01df36e1"}, - {file = "google_re2-1.1-3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d140c7b9395b4d1e654127aa1c99bcc603ed01000b7bc7e28c52562f1894ec12"}, - {file = "google_re2-1.1-3-cp39-cp39-win32.whl", hash = "sha256:80c5fc200f64b2d903eeb07b8d6cefc620a872a0240c7caaa9aca05b20f5568f"}, - {file = "google_re2-1.1-3-cp39-cp39-win_amd64.whl", hash = 
"sha256:9eb6dbcee9b5dc4069bbc0634f2eb039ca524a14bed5868fdf6560aaafcbca06"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0db114d7e1aa96dbcea452a40136d7d747d60cbb61394965774688ef59cccd4e"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:82133958e003a1344e5b7a791b9a9dd7560b5c8f96936dbe16f294604524a633"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:9e74fd441d1f3d917d3303e319f61b82cdbd96b9a5ba919377a6eef1504a1e2b"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:734a2e7a4541c57253b5ebee24f3f3366ba3658bcad01da25fb623c78723471a"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:d88d5eecbc908abe16132456fae13690d0508f3ac5777f320ef95cb6cab9a961"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:b91db80b171ecec435a07977a227757dd487356701a32f556fa6fca5d0a40522"}, - {file = "google_re2-1.1-4-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b23129887a64bb9948af14c84705273ed1a40054e99433b4acccab4dcf6a226"}, - {file = "google_re2-1.1-4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5dc1a0cc7cd19261dcaf76763e2499305dbb7e51dc69555167cdb8af98782698"}, - {file = "google_re2-1.1-4-cp310-cp310-win32.whl", hash = "sha256:3b2ab1e2420b5dd9743a2d6bc61b64e5f708563702a75b6db86637837eaeaf2f"}, - {file = "google_re2-1.1-4-cp310-cp310-win_amd64.whl", hash = "sha256:92efca1a7ef83b6df012d432a1cbc71d10ff42200640c0f9a5ff5b343a48e633"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:854818fd4ce79787aca5ba459d6e5abe4ca9be2c684a5b06a7f1757452ca3708"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:4ceef51174b6f653b6659a8fdaa9c38960c5228b44b25be2a3bcd8566827554f"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:ee49087c3db7e6f5238105ab5299c09e9b77516fe8cfb0a37e5f1e813d76ecb8"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:dc2312854bdc01410acc5d935f1906a49cb1f28980341c20a68797ad89d8e178"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:0dc0d2e42296fa84a3cb3e1bd667c6969389cd5cdf0786e6b1f911ae2d75375b"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6bf04ced98453b035f84320f348f67578024f44d2997498def149054eb860ae8"}, - {file = "google_re2-1.1-4-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d6b6ef11dc4ab322fa66c2f3561925f2b5372a879c3ed764d20e939e2fd3e5f"}, - {file = "google_re2-1.1-4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0dcde6646fa9a97fd3692b3f6ae7daf7f3277d7500b6c253badeefa11db8956a"}, - {file = "google_re2-1.1-4-cp311-cp311-win32.whl", hash = "sha256:5f4f0229deb057348893574d5b0a96d055abebac6debf29d95b0c0e26524c9f6"}, - {file = "google_re2-1.1-4-cp311-cp311-win_amd64.whl", hash = "sha256:4713ddbe48a18875270b36a462b0eada5e84d6826f8df7edd328d8706b6f9d07"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:40a698300b8faddbb325662973f839489c89b960087060bd389c376828978a04"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:103d2d7ac92ba23911a151fd1fc7035cbf6dc92a7f6aea92270ebceb5cd5acd3"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:51fb7182bccab05e8258a2b6a63dda1a6b4a9e8dfb9b03ec50e50c49c2827dd4"}, - {file = 
"google_re2-1.1-4-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:65383022abd63d7b620221eba7935132b53244b8b463d8fdce498c93cf58b7b7"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:396281fc68a9337157b3ffcd9392c6b7fcb8aab43e5bdab496262a81d56a4ecc"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:8198adcfcff1c680e052044124621730fc48d08005f90a75487f5651f1ebfce2"}, - {file = "google_re2-1.1-4-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:81f7bff07c448aec4db9ca453d2126ece8710dbd9278b8bb09642045d3402a96"}, - {file = "google_re2-1.1-4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7dacf730fd7d6ec71b11d6404b0b26e230814bfc8e9bb0d3f13bec9b5531f8d"}, - {file = "google_re2-1.1-4-cp312-cp312-win32.whl", hash = "sha256:8c764f62f4b1d89d1ef264853b6dd9fee14a89e9b86a81bc2157fe3531425eb4"}, - {file = "google_re2-1.1-4-cp312-cp312-win_amd64.whl", hash = "sha256:0be2666df4bc5381a5d693585f9bbfefb0bfd3c07530d7e403f181f5de47254a"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:5cb1b63a0bfd8dd65d39d2f3b2e5ae0a06ce4b2ce5818a1d1fc78a786a252673"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:e41751ce6b67a95230edd0772226dc94c2952a2909674cd69df9804ed0125307"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:b998cfa2d50bf4c063e777c999a7e8645ec7e5d7baf43ad71b1e2e10bb0300c3"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:226ca3b0c2e970f3fc82001ac89e845ecc7a4bb7c68583e7a76cda70b61251a7"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:9adec1f734ebad7c72e56c85f205a281d8fe9bf6583bc21020157d3f2812ce89"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:9c34f3c64ba566af967d29e11299560e6fdfacd8ca695120a7062b6ed993b179"}, - {file = "google_re2-1.1-4-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e1b85385fe293838e0d0b6e19e6c48ba8c6f739ea92ce2e23b718afe7b343363"}, - {file = "google_re2-1.1-4-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4694daa8a8987cfb568847aa872f9990e930c91a68c892ead876411d4b9012c3"}, - {file = "google_re2-1.1-4-cp38-cp38-win32.whl", hash = "sha256:5e671e9be1668187e2995aac378de574fa40df70bb6f04657af4d30a79274ce0"}, - {file = "google_re2-1.1-4-cp38-cp38-win_amd64.whl", hash = "sha256:f66c164d6049a8299f6dfcfa52d1580576b4b9724d6fcdad2f36f8f5da9304b6"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:25cb17ae0993a48c70596f3a3ef5d659638106401cc8193f51c0d7961b3b3eb7"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:5f101f86d14ca94ca4dcf63cceaa73d351f2be2481fcaa29d9e68eeab0dc2a88"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:4e82591e85bf262a6d74cff152867e05fc97867c68ba81d6836ff8b0e7e62365"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:1f61c09b93ffd34b1e2557e5a9565039f935407a5786dbad46f64f1a484166e6"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:12b390ad8c7e74bab068732f774e75e0680dade6469b249a721f3432f90edfc3"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:1284343eb31c2e82ed2d8159f33ba6842238a56782c881b07845a6d85613b055"}, - {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:6c7b38e0daf2c06e4d3163f4c732ab3ad2521aecfed6605b69e4482c612da303"}, - {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"}, - {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = "sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"}, - {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"}, ] [[package]] @@ -4545,13 +4387,10 @@ files = [ {file = "lxml-4.9.3-cp27-cp27m-macosx_11_0_x86_64.whl", hash = "sha256:b0a545b46b526d418eb91754565ba5b63b1c0b12f9bd2f808c852d9b4b2f9b5c"}, {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:075b731ddd9e7f68ad24c635374211376aa05a281673ede86cbe1d1b3455279d"}, {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1e224d5755dba2f4a9498e150c43792392ac9b5380aa1b845f98a1618c94eeef"}, - {file = "lxml-4.9.3-cp27-cp27m-win32.whl", hash = "sha256:2c74524e179f2ad6d2a4f7caf70e2d96639c0954c943ad601a9e146c76408ed7"}, - {file = "lxml-4.9.3-cp27-cp27m-win_amd64.whl", hash = "sha256:4f1026bc732b6a7f96369f7bfe1a4f2290fb34dce00d8644bc3036fb351a4ca1"}, {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c0781a98ff5e6586926293e59480b64ddd46282953203c76ae15dbbbf302e8bb"}, {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cef2502e7e8a96fe5ad686d60b49e1ab03e438bd9123987994528febd569868e"}, {file = "lxml-4.9.3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b86164d2cff4d3aaa1f04a14685cbc072efd0b4f99ca5708b2ad1b9b5988a991"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:42871176e7896d5d45138f6d28751053c711ed4d48d8e30b498da155af39aebd"}, - {file = "lxml-4.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ae8b9c6deb1e634ba4f1930eb67ef6e6bf6a44b6eb5ad605642b2d6d5ed9ce3c"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:411007c0d88188d9f621b11d252cce90c4a2d1a49db6c068e3c16422f306eab8"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:cd47b4a0d41d2afa3e58e5bf1f62069255aa2fd6ff5ee41604418ca925911d76"}, {file = "lxml-4.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e2cb47860da1f7e9a5256254b74ae331687b9672dfa780eed355c4c9c3dbd23"}, @@ -4560,7 +4399,6 @@ files = [ {file = "lxml-4.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:97047f0d25cd4bcae81f9ec9dc290ca3e15927c192df17331b53bebe0e3ff96d"}, {file = "lxml-4.9.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:1f447ea5429b54f9582d4b955f5f1985f278ce5cf169f72eea8afd9502973dd5"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:57d6ba0ca2b0c462f339640d22882acc711de224d769edf29962b09f77129cbf"}, - {file = "lxml-4.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:9767e79108424fb6c3edf8f81e6730666a50feb01a328f4a016464a5893f835a"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:71c52db65e4b56b8ddc5bb89fb2e66c558ed9d1a74a45ceb7dcb20c191c3df2f"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d73d8ecf8ecf10a3bd007f2192725a34bd62898e8da27eb9d32a58084f93962b"}, {file = "lxml-4.9.3-cp311-cp311-musllinux_1_1_aarch64.whl", 
hash = "sha256:0a3d3487f07c1d7f150894c238299934a2a074ef590b583103a45002035be120"}, @@ -4580,7 +4418,6 @@ files = [ {file = "lxml-4.9.3-cp36-cp36m-macosx_11_0_x86_64.whl", hash = "sha256:64f479d719dc9f4c813ad9bb6b28f8390360660b73b2e4beb4cb0ae7104f1c12"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:dd708cf4ee4408cf46a48b108fb9427bfa00b9b85812a9262b5c668af2533ea5"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c31c7462abdf8f2ac0577d9f05279727e698f97ecbb02f17939ea99ae8daa98"}, - {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e3cd95e10c2610c360154afdc2f1480aea394f4a4f1ea0a5eacce49640c9b190"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:4930be26af26ac545c3dffb662521d4e6268352866956672231887d18f0eaab2"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4aec80cde9197340bc353d2768e2a75f5f60bacda2bab72ab1dc499589b3878c"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:14e019fd83b831b2e61baed40cab76222139926b1fb5ed0e79225bc0cae14584"}, @@ -4590,7 +4427,6 @@ files = [ {file = "lxml-4.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bef4e656f7d98aaa3486d2627e7d2df1157d7e88e7efd43a65aa5dd4714916cf"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:46f409a2d60f634fe550f7133ed30ad5321ae2e6630f13657fb9479506b00601"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:4c28a9144688aef80d6ea666c809b4b0e50010a2aca784c97f5e6bf143d9f129"}, - {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:141f1d1a9b663c679dc524af3ea1773e618907e96075262726c7612c02b149a4"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:53ace1c1fd5a74ef662f844a0413446c0629d151055340e9893da958a374f70d"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17a753023436a18e27dd7769e798ce302963c236bc4114ceee5b25c18c52c693"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7d298a1bd60c067ea75d9f684f5f3992c9d6766fadbc0bcedd39750bf344c2f4"}, @@ -4600,7 +4436,6 @@ files = [ {file = "lxml-4.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:120fa9349a24c7043854c53cae8cec227e1f79195a7493e09e0c12e29f918e52"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4d2d1edbca80b510443f51afd8496be95529db04a509bc8faee49c7b0fb6d2cc"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:8d7e43bd40f65f7d97ad8ef5c9b1778943d02f04febef12def25f7583d19baac"}, - {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:71d66ee82e7417828af6ecd7db817913cb0cf9d4e61aa0ac1fde0583d84358db"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:6fc3c450eaa0b56f815c7b62f2b7fba7266c4779adcf1cece9e6deb1de7305ce"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:65299ea57d82fb91c7f019300d24050c4ddeb7c5a190e076b5f48a2b43d19c42"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = 
"sha256:eadfbbbfb41b44034a4c757fd5d70baccd43296fb894dba0295606a7cf3124aa"}, @@ -4610,7 +4445,6 @@ files = [ {file = "lxml-4.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:92af161ecbdb2883c4593d5ed4815ea71b31fafd7fd05789b23100d081ecac96"}, {file = "lxml-4.9.3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:9bb6ad405121241e99a86efff22d3ef469024ce22875a7ae045896ad23ba2340"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8ed74706b26ad100433da4b9d807eae371efaa266ffc3e9191ea436087a9d6a7"}, - {file = "lxml-4.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:fbf521479bcac1e25a663df882c46a641a9bff6b56dc8b0fafaebd2f66fb231b"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:303bf1edce6ced16bf67a18a1cf8339d0db79577eec5d9a6d4a80f0fb10aa2da"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:5515edd2a6d1a5a70bfcdee23b42ec33425e405c5b351478ab7dc9347228f96e"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:690dafd0b187ed38583a648076865d8c229661ed20e48f2335d68e2cf7dc829d"}, @@ -4621,16 +4455,13 @@ files = [ {file = "lxml-4.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:4dd9a263e845a72eacb60d12401e37c616438ea2e5442885f65082c276dfb2b2"}, {file = "lxml-4.9.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6689a3d7fd13dc687e9102a27e98ef33730ac4fe37795d5036d18b4d527abd35"}, {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:f6bdac493b949141b733c5345b6ba8f87a226029cbabc7e9e121a413e49441e0"}, - {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:05186a0f1346ae12553d66df1cfce6f251589fea3ad3da4f3ef4e34b2d58c6a3"}, {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c2006f5c8d28dee289f7020f721354362fa304acbaaf9745751ac4006650254b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-macosx_11_0_x86_64.whl", hash = "sha256:5c245b783db29c4e4fbbbfc9c5a78be496c9fea25517f90606aa1f6b2b3d5f7b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4fb960a632a49f2f089d522f70496640fdf1218f1243889da3822e0a9f5f3ba7"}, - {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:50670615eaf97227d5dc60de2dc99fb134a7130d310d783314e7724bf163f75d"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9719fe17307a9e814580af1f5c6e05ca593b12fb7e44fe62450a5384dbf61b4b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:3331bece23c9ee066e0fb3f96c61322b9e0f54d775fccefff4c38ca488de283a"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-macosx_11_0_x86_64.whl", hash = "sha256:ed667f49b11360951e201453fc3967344d0d0263aa415e1619e85ae7fd17b4e0"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8b77946fd508cbf0fccd8e400a7f71d4ac0e1595812e66025bac475a8e811694"}, - {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e4da8ca0c0c0aea88fd46be8e44bd49716772358d648cce45fe387f7b92374a7"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fe4bda6bd4340caa6e5cf95e73f8fea5c4bfc55763dd42f1b50a94c1b4a2fbd4"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-win_amd64.whl", hash = 
"sha256:f3df3db1d336b9356dd3112eae5f5c2b8b377f3bc826848567f10bfddfee77e9"}, {file = "lxml-4.9.3.tar.gz", hash = "sha256:48628bd53a426c9eb9bc066a923acaa0878d1e86129fd5359aee99285f4eed9c"}, @@ -4791,16 +4622,6 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -5886,8 +5707,8 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.20.3", markers = "python_version < \"3.10\""}, - {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -6775,7 +6596,6 @@ files = [ {file = "pymongo-4.6.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ab6bcc8e424e07c1d4ba6df96f7fb963bcb48f590b9456de9ebd03b88084fe8"}, {file = "pymongo-4.6.0-cp312-cp312-win32.whl", hash = "sha256:47aa128be2e66abd9d1a9b0437c62499d812d291f17b55185cb4aa33a5f710a4"}, {file = "pymongo-4.6.0-cp312-cp312-win_amd64.whl", hash = 
"sha256:014e7049dd019a6663747ca7dae328943e14f7261f7c1381045dfc26a04fa330"}, - {file = "pymongo-4.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e24025625bad66895b1bc3ae1647f48f0a92dd014108fb1be404c77f0b69ca67"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:288c21ab9531b037f7efa4e467b33176bc73a0c27223c141b822ab4a0e66ff2a"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:747c84f4e690fbe6999c90ac97246c95d31460d890510e4a3fa61b7d2b87aa34"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:055f5c266e2767a88bb585d01137d9c7f778b0195d3dbf4a487ef0638be9b651"}, @@ -7216,7 +7036,6 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -7224,16 +7043,8 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = 
"sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -7250,7 +7061,6 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -7258,7 +7068,6 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -7790,34 +7599,41 @@ files = [ [[package]] name = "sentry-sdk" -version = "1.30.0" +version = "2.5.1" description = "Python client for Sentry (https://sentry.io)" optional = false -python-versions = "*" +python-versions = ">=3.6" files = [ - {file = "sentry-sdk-1.30.0.tar.gz", hash = "sha256:7dc873b87e1faf4d00614afd1058bfa1522942f33daef8a59f90de8ed75cd10c"}, - {file = "sentry_sdk-1.30.0-py2.py3-none-any.whl", hash = "sha256:2e53ad63f96bb9da6570ba2e755c267e529edcf58580a2c0d2a11ef26e1e678b"}, + {file = "sentry_sdk-2.5.1-py2.py3-none-any.whl", hash = "sha256:1f87acdce4a43a523ae5aa21a3fc37522d73ebd9ec04b1dbf01aa3d173852def"}, + {file = "sentry_sdk-2.5.1.tar.gz", hash = "sha256:fbc40a78a8a9c6675133031116144f0d0940376fa6e4e1acd5624c90b0aaf58b"}, ] 
[package.dependencies] certifi = "*" -urllib3 = {version = ">=1.26.11", markers = "python_version >= \"3.6\""} +urllib3 = ">=1.26.11" [package.extras] aiohttp = ["aiohttp (>=3.5)"] +anthropic = ["anthropic (>=0.16)"] arq = ["arq (>=0.23)"] +asyncpg = ["asyncpg (>=0.23)"] beam = ["apache-beam (>=2.12)"] bottle = ["bottle (>=0.12.13)"] celery = ["celery (>=3)"] +celery-redbeat = ["celery-redbeat (>=2)"] chalice = ["chalice (>=1.16.0)"] +clickhouse-driver = ["clickhouse-driver (>=0.2.0)"] django = ["django (>=1.8)"] falcon = ["falcon (>=1.4)"] fastapi = ["fastapi (>=0.79.0)"] flask = ["blinker (>=1.1)", "flask (>=0.11)", "markupsafe"] -grpcio = ["grpcio (>=1.21.1)"] +grpcio = ["grpcio (>=1.21.1)", "protobuf (>=3.8.0)"] httpx = ["httpx (>=0.16.0)"] huey = ["huey (>=2)"] +huggingface-hub = ["huggingface-hub (>=0.22)"] +langchain = ["langchain (>=0.0.210)"] loguru = ["loguru (>=0.5)"] +openai = ["openai (>=1.0.0)", "tiktoken (>=0.3.0)"] opentelemetry = ["opentelemetry-distro (>=0.35b0)"] opentelemetry-experimental = ["opentelemetry-distro (>=0.40b0,<1.0)", "opentelemetry-instrumentation-aiohttp-client (>=0.40b0,<1.0)", "opentelemetry-instrumentation-django (>=0.40b0,<1.0)", "opentelemetry-instrumentation-fastapi (>=0.40b0,<1.0)", "opentelemetry-instrumentation-flask (>=0.40b0,<1.0)", "opentelemetry-instrumentation-requests (>=0.40b0,<1.0)", "opentelemetry-instrumentation-sqlite3 (>=0.40b0,<1.0)", "opentelemetry-instrumentation-urllib (>=0.40b0,<1.0)"] pure-eval = ["asttokens", "executing", "pure-eval"] @@ -8146,7 +7962,6 @@ files = [ {file = "SQLAlchemy-1.4.49-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:03db81b89fe7ef3857b4a00b63dedd632d6183d4ea5a31c5d8a92e000a41fc71"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:95b9df9afd680b7a3b13b38adf6e3a38995da5e162cc7524ef08e3be4e5ed3e1"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a63e43bf3f668c11bb0444ce6e809c1227b8f067ca1068898f3008a273f52b09"}, - {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca46de16650d143a928d10842939dab208e8d8c3a9a8757600cae9b7c579c5cd"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f835c050ebaa4e48b18403bed2c0fda986525896efd76c245bdd4db995e51a4c"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c21b172dfb22e0db303ff6419451f0cac891d2e911bb9fbf8003d717f1bcf91"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-win32.whl", hash = "sha256:5fb1ebdfc8373b5a291485757bd6431de8d7ed42c27439f543c81f6c8febd729"}, @@ -8156,35 +7971,26 @@ files = [ {file = "SQLAlchemy-1.4.49-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5debe7d49b8acf1f3035317e63d9ec8d5e4d904c6e75a2a9246a119f5f2fdf3d"}, {file = "SQLAlchemy-1.4.49-cp311-cp311-win32.whl", hash = "sha256:82b08e82da3756765c2e75f327b9bf6b0f043c9c3925fb95fb51e1567fa4ee87"}, {file = "SQLAlchemy-1.4.49-cp311-cp311-win_amd64.whl", hash = "sha256:171e04eeb5d1c0d96a544caf982621a1711d078dbc5c96f11d6469169bd003f1"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f23755c384c2969ca2f7667a83f7c5648fcf8b62a3f2bbd883d805454964a800"}, - {file = 
"SQLAlchemy-1.4.49-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8396e896e08e37032e87e7fbf4a15f431aa878c286dc7f79e616c2feacdb366c"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66da9627cfcc43bbdebd47bfe0145bb662041472393c03b7802253993b6b7c90"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-win32.whl", hash = "sha256:9a06e046ffeb8a484279e54bda0a5abfd9675f594a2e38ef3133d7e4d75b6214"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-win_amd64.whl", hash = "sha256:7cf8b90ad84ad3a45098b1c9f56f2b161601e4670827d6b892ea0e884569bd1d"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:36e58f8c4fe43984384e3fbe6341ac99b6b4e083de2fe838f0fdb91cebe9e9cb"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b31e67ff419013f99ad6f8fc73ee19ea31585e1e9fe773744c0f3ce58c039c30"}, - {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc22807a7e161c0d8f3da34018ab7c97ef6223578fcdd99b1d3e7ed1100a5db"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c14b29d9e1529f99efd550cd04dbb6db6ba5d690abb96d52de2bff4ed518bc95"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c40f3470e084d31247aea228aa1c39bbc0904c2b9ccbf5d3cfa2ea2dac06f26d"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-win32.whl", hash = "sha256:706bfa02157b97c136547c406f263e4c6274a7b061b3eb9742915dd774bbc264"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-win_amd64.whl", hash = "sha256:a7f7b5c07ae5c0cfd24c2db86071fb2a3d947da7bd487e359cc91e67ac1c6d2e"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:4afbbf5ef41ac18e02c8dc1f86c04b22b7a2125f2a030e25bbb4aff31abb224b"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:24e300c0c2147484a002b175f4e1361f102e82c345bf263242f0449672a4bccf"}, - {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:393cd06c3b00b57f5421e2133e088df9cabcececcea180327e43b937b5a7caa5"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:201de072b818f8ad55c80d18d1a788729cccf9be6d9dc3b9d8613b053cd4836d"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7653ed6817c710d0c95558232aba799307d14ae084cc9b1f4c389157ec50df5c"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-win32.whl", hash = "sha256:647e0b309cb4512b1f1b78471fdaf72921b6fa6e750b9f891e09c6e2f0e5326f"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-win_amd64.whl", hash = "sha256:ab73ed1a05ff539afc4a7f8cf371764cdf79768ecb7d2ec691e3ff89abbc541e"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:37ce517c011560d68f1ffb28af65d7e06f873f191eb3a73af5671e9c3fada08a"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1878ce508edea4a879015ab5215546c444233881301e97ca16fe251e89f1c55"}, - {file = 
"SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95ab792ca493891d7a45a077e35b418f68435efb3e1706cb8155e20e86a9013c"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0e8e608983e6f85d0852ca61f97e521b62e67969e6e640fe6c6b575d4db68557"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccf956da45290df6e809ea12c54c02ace7f8ff4d765d6d3dfb3655ee876ce58d"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-win32.whl", hash = "sha256:f167c8175ab908ce48bd6550679cc6ea20ae169379e73c7720a28f89e53aa532"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-win_amd64.whl", hash = "sha256:45806315aae81a0c202752558f0df52b42d11dd7ba0097bf71e253b4215f34f4"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:b6d0c4b15d65087738a6e22e0ff461b407533ff65a73b818089efc8eb2b3e1de"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a843e34abfd4c797018fd8d00ffffa99fd5184c421f190b6ca99def4087689bd"}, - {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:738d7321212941ab19ba2acf02a68b8ee64987b248ffa2101630e8fccb549e0d"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1c890421651b45a681181301b3497e4d57c0d01dc001e10438a40e9a9c25ee77"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d26f280b8f0a8f497bc10573849ad6dc62e671d2468826e5c748d04ed9e670d5"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-win32.whl", hash = "sha256:ec2268de67f73b43320383947e74700e95c6770d0c68c4e615e9897e46296294"}, @@ -9282,4 +9088,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "9644e603fdf7b7ca6d177247950370b86ba1c84849deb7cfd83510086cb2e193" +content-hash = "47136cc3a6247e709dfe04a810df7309d1a2bc7fe838592dd5f58dc39c2407c8" diff --git a/pyproject.toml b/pyproject.toml index 36ee683de9..10e3bf47d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -184,12 +184,11 @@ google-api-python-client = "^2.86.0" optional = true [tool.poetry.group.sentry-sdk.dependencies] -sentry-sdk = "^1.5.6" +sentry-sdk = "^2.0.0" [tool.poetry.group.docs] optional = true - [tool.poetry.group.dbt] optional = true [tool.poetry.group.dbt.dependencies] diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index fd5099af9b..f838f31333 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -641,8 +641,8 @@ def test_sentry_tracing() -> None: @dlt.resource def r_check_sentry(): - assert sentry_sdk.Hub.current.scope.span.op == "extract" - assert sentry_sdk.Hub.current.scope.span.containing_transaction.name == "run" + assert sentry_sdk.Scope.get_current_scope().span.op == "extract" + assert sentry_sdk.Scope.get_current_scope().transaction.name == "run" yield [1, 2, 3] p.run(r_check_sentry) From 14f06e43e8824ecbaf6dcb2fcb1a8d7421ae1144 Mon Sep 17 00:00:00 2001 From: matsmhans1 <107866523+matsmhans1@users.noreply.github.com> Date: Tue, 18 Jun 2024 19:51:56 +0200 Subject: [PATCH 25/61] fix: allow 
loggeradapter in addition to logger in logcollector (#1483) * fix: allow loggeradapter in addition to logger in logcollector the _log method in LogCollector now checks for either logging.Logger or logging.LoggerAdapter unlike previously were it only allowed the former. * fix: allow loggeradapter in addition to logger in logcollector The _log method in LogCollector now checks for either logging.Logger or logging.LoggerAdapter unlike previously were it only allowed the former. --- dlt/common/runtime/collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/common/runtime/collector.py b/dlt/common/runtime/collector.py index e478d713b2..e00bca576e 100644 --- a/dlt/common/runtime/collector.py +++ b/dlt/common/runtime/collector.py @@ -227,7 +227,7 @@ def dump_counters(self) -> None: self._log(self.log_level, log_message) def _log(self, log_level: int, log_message: str) -> None: - if isinstance(self.logger, logging.Logger): + if isinstance(self.logger, (logging.Logger, logging.LoggerAdapter)): self.logger.log(log_level, log_message) else: print(log_message, file=self.logger or sys.stdout) From b267c704efb46c37db44a2b2cabd0cd28291c8de Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Tue, 18 Jun 2024 17:30:52 -0400 Subject: [PATCH 26/61] Add load_id to arrow tables in extract step instead of normalize (#1449) * Add load_id to arrow tables in extract step instead of normalize * Test arrow load id in extract * Get normalize config without decorator * Normalize load ID column name * Load ID column goes last * adds update_table column order tests --------- Co-authored-by: Marcin Rudolf --- dlt/common/libs/pyarrow.py | 76 +++++++++++++++++-- dlt/extract/extractors.py | 44 ++++++++++- dlt/normalize/items_normalizers.py | 43 ++--------- .../verified-sources/arrow-pandas.md | 5 +- tests/common/schema/test_inference.py | 21 +++++ tests/pipeline/test_arrow_sources.py | 58 ++++++++++++++ 6 files changed, 200 insertions(+), 47 deletions(-) diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index 28f3ddb598..8a6dc68078 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -226,7 +226,8 @@ def should_normalize_arrow_schema( schema: pyarrow.Schema, columns: TTableSchemaColumns, naming: NamingConvention, -) -> Tuple[bool, Mapping[str, str], Dict[str, str], Dict[str, bool], TTableSchemaColumns]: + add_load_id: bool = False, +) -> Tuple[bool, Mapping[str, str], Dict[str, str], Dict[str, bool], bool, TTableSchemaColumns]: rename_mapping = get_normalized_arrow_fields_mapping(schema, naming) rev_mapping = {v: k for k, v in rename_mapping.items()} nullable_mapping = {k: v.get("nullable", True) for k, v in columns.items()} @@ -238,21 +239,42 @@ def should_normalize_arrow_schema( if norm_name in nullable_mapping and field.nullable != nullable_mapping[norm_name]: nullable_updates[norm_name] = nullable_mapping[norm_name] - dlt_tables = list(map(naming.normalize_table_identifier, ("_dlt_id", "_dlt_load_id"))) + dlt_load_id_col = naming.normalize_table_identifier("_dlt_load_id") + dlt_id_col = naming.normalize_table_identifier("_dlt_id") + dlt_columns = {dlt_load_id_col, dlt_id_col} + + # Do we need to add a load id column? + if add_load_id and dlt_load_id_col in columns: + try: + schema.field(dlt_load_id_col) + needs_load_id = False + except KeyError: + needs_load_id = True + else: + needs_load_id = False # remove all columns that are dlt columns but are not present in arrow schema. 
we do not want to add such columns # that should happen in the normalizer columns = { name: column for name, column in columns.items() - if name not in dlt_tables or name in rev_mapping + if name not in dlt_columns or name in rev_mapping } # check if nothing to rename skip_normalize = ( - list(rename_mapping.keys()) == list(rename_mapping.values()) == list(columns.keys()) - ) and not nullable_updates - return not skip_normalize, rename_mapping, rev_mapping, nullable_updates, columns + (list(rename_mapping.keys()) == list(rename_mapping.values()) == list(columns.keys())) + and not nullable_updates + and not needs_load_id + ) + return ( + not skip_normalize, + rename_mapping, + rev_mapping, + nullable_updates, + needs_load_id, + columns, + ) def normalize_py_arrow_item( @@ -260,6 +282,7 @@ def normalize_py_arrow_item( columns: TTableSchemaColumns, naming: NamingConvention, caps: DestinationCapabilitiesContext, + load_id: Optional[str] = None, ) -> TAnyArrowItem: """Normalize arrow `item` schema according to the `columns`. @@ -267,10 +290,11 @@ def normalize_py_arrow_item( 2. arrows columns will be reordered according to `columns` 3. empty columns will be inserted if they are missing, types will be generated using `caps` 4. arrow columns with different nullability than corresponding schema columns will be updated + 5. Add `_dlt_load_id` column if it is missing and `load_id` is provided """ schema = item.schema - should_normalize, rename_mapping, rev_mapping, nullable_updates, columns = ( - should_normalize_arrow_schema(schema, columns, naming) + should_normalize, rename_mapping, rev_mapping, nullable_updates, needs_load_id, columns = ( + should_normalize_arrow_schema(schema, columns, naming, load_id is not None) ) if not should_normalize: return item @@ -307,6 +331,18 @@ def normalize_py_arrow_item( new_fields.append(schema.field(idx).with_name(column_name)) new_columns.append(item.column(idx)) + if needs_load_id and load_id: + # Storage efficient type for a column with constant value + load_id_type = pyarrow.dictionary(pyarrow.int8(), pyarrow.string()) + new_fields.append( + pyarrow.field( + naming.normalize_table_identifier("_dlt_load_id"), + load_id_type, + nullable=False, + ) + ) + new_columns.append(pyarrow.array([load_id] * item.num_rows, type=load_id_type)) + # create desired type return item.__class__.from_arrays(new_columns, schema=pyarrow.schema(new_fields)) @@ -383,6 +419,30 @@ def from_arrow_scalar(arrow_value: pyarrow.Scalar) -> Any: """Sequence of tuples: (field index, field, generating function)""" +def add_constant_column( + item: TAnyArrowItem, + name: str, + data_type: pyarrow.DataType, + value: Any = None, + nullable: bool = True, + index: int = -1, +) -> TAnyArrowItem: + """Add column with a single value to the table. + + Args: + item: Arrow table or record batch + name: The new column name + data_type: The data type of the new column + nullable: Whether the new column is nullable + value: The value to fill the new column with + index: The index at which to insert the new column. 
Defaults to -1 (append) + """ + field = pyarrow.field(name, pyarrow.dictionary(pyarrow.int8(), data_type), nullable=nullable) + if index == -1: + return item.append_column(field, pyarrow.array([value] * item.num_rows, type=field.type)) + return item.add_column(index, field, pyarrow.array([value] * item.num_rows, type=field.type)) + + def pq_stream_with_new_columns( parquet_file: TFileOrPath, columns: TNewColumns, row_groups_per_read: int = 1 ) -> Iterator[pyarrow.Table]: diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index 8f95211aa0..48f0d6968e 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -1,8 +1,8 @@ from copy import copy from typing import Set, Dict, Any, Optional, List +from dlt.common.configuration import known_sections, resolve_configuration, with_config from dlt.common import logger -from dlt.common.configuration.inject import with_config from dlt.common.configuration.specs import BaseConfiguration, configspec from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.exceptions import MissingDependencyException @@ -21,6 +21,7 @@ from dlt.extract.resource import DltResource from dlt.extract.items import TableNameMeta from dlt.extract.storage import ExtractorItemStorage +from dlt.normalize.configuration import ItemsNormalizerConfiguration try: from dlt.common.libs import pyarrow @@ -215,13 +216,29 @@ class ObjectExtractor(Extractor): class ArrowExtractor(Extractor): """Extracts arrow data items into parquet. Normalizes arrow items column names. Compares the arrow schema to actual dlt table schema to reorder the columns and to - insert missing columns (without data). + insert missing columns (without data). Adds _dlt_load_id column to the table if + `add_dlt_load_id` is set to True in normalizer config. We do things that normalizer should do here so we do not need to load and save parquet files again later. 
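For orientation, a minimal standalone sketch of the dictionary-encoded constant-column technique the pyarrow changes above rely on (the toy table and load id value are illustrative only, not part of the patch):

```python
import pyarrow as pa

# Toy stand-ins: a small arrow item and a made-up load id.
item = pa.table({"id": [1, 2, 3]})
load_id = "1718750000.0"

# Dictionary-encode the constant so the repeated string is stored only once;
# this mirrors the pa.dictionary(pa.int8(), pa.string()) type used in the diff.
load_id_type = pa.dictionary(pa.int8(), pa.string())
field = pa.field("_dlt_load_id", load_id_type, nullable=False)
values = pa.array([load_id] * item.num_rows, type=load_id_type)

# Appending keeps the load id as the last column, matching the patch behavior.
item = item.append_column(field, values)
print(item.schema)
```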
+ Handles the following types: + - `pyarrow.Table` + - `pyarrow.RecordBatch` + - `pandas.DataFrame` (is converted to arrow `Table` before processing) """ + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self._normalize_config = self._retrieve_normalize_config() + + def _retrieve_normalize_config(self) -> ItemsNormalizerConfiguration: + """Get normalizer settings that are used here""" + return resolve_configuration( + ItemsNormalizerConfiguration(), + sections=(known_sections.NORMALIZE, "parquet_normalizer"), + ) + def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: static_table_name = self._get_static_table_name(resource, meta) items = [ @@ -294,7 +311,13 @@ def _write_item( columns = columns or self.schema.get_table_columns(table_name) # Note: `items` is always a list here due to the conversion in `write_table` items = [ - pyarrow.normalize_py_arrow_item(item, columns, self.naming, self._caps) + pyarrow.normalize_py_arrow_item( + item, + columns, + self.naming, + self._caps, + load_id=self.load_id if self._normalize_config.add_dlt_load_id else None, + ) for item in items ] # write items one by one @@ -316,8 +339,22 @@ def _compute_table( else: arrow_table = copy(computed_table) arrow_table["columns"] = pyarrow.py_arrow_to_table_schema_columns(item.schema) + + # Add load_id column if needed + dlt_load_id_col = self.naming.normalize_table_identifier("_dlt_load_id") + if ( + self._normalize_config.add_dlt_load_id + and dlt_load_id_col not in arrow_table["columns"] + ): + arrow_table["columns"][dlt_load_id_col] = { + "name": dlt_load_id_col, + "data_type": "text", + "nullable": False, + } + # normalize arrow table before merging arrow_table = self.schema.normalize_table_identifiers(arrow_table) + # issue warnings when overriding computed with arrow override_warn: bool = False for col_name, column in arrow_table["columns"].items(): @@ -343,6 +380,7 @@ def _compute_table( utils.merge_columns( arrow_table["columns"], computed_table["columns"], merge_columns=True ) + return arrow_table def _compute_and_update_table( diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index eed98d7563..6678f6edee 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -228,37 +228,13 @@ class ArrowItemsNormalizer(ItemsNormalizer): REWRITE_ROW_GROUPS = 1 def _write_with_dlt_columns( - self, extracted_items_file: str, root_table_name: str, add_load_id: bool, add_dlt_id: bool + self, extracted_items_file: str, root_table_name: str, add_dlt_id: bool ) -> List[TSchemaUpdate]: new_columns: List[Any] = [] schema = self.schema load_id = self.load_id schema_update: TSchemaUpdate = {} - if add_load_id: - table_update = schema.update_table( - { - "name": root_table_name, - "columns": { - "_dlt_load_id": { - "name": "_dlt_load_id", - "data_type": "text", - "nullable": False, - } - }, - } - ) - table_updates = schema_update.setdefault(root_table_name, []) - table_updates.append(table_update) - load_id_type = pa.dictionary(pa.int8(), pa.string()) - new_columns.append( - ( - -1, - pa.field("_dlt_load_id", load_id_type, nullable=False), - lambda batch: pa.array([load_id] * batch.num_rows, type=load_id_type), - ) - ) - if add_dlt_id: table_update = schema.update_table( { @@ -292,9 +268,9 @@ def _write_with_dlt_columns( items_count += batch.num_rows # we may need to normalize if is_native_arrow_writer and should_normalize is None: - should_normalize, _, _, _, _ = 
pyarrow.should_normalize_arrow_schema( + should_normalize = pyarrow.should_normalize_arrow_schema( batch.schema, columns_schema, schema.naming - ) + )[0] if should_normalize: logger.info( f"When writing arrow table to {root_table_name} the schema requires" @@ -366,25 +342,22 @@ def __call__(self, extracted_items_file: str, root_table_name: str) -> List[TSch base_schema_update = self._fix_schema_precisions(root_table_name, arrow_schema) add_dlt_id = self.config.parquet_normalizer.add_dlt_id - add_dlt_load_id = self.config.parquet_normalizer.add_dlt_load_id # if we need to add any columns or the file format is not parquet, we can't just import files - must_rewrite = ( - add_dlt_id or add_dlt_load_id or self.item_storage.writer_spec.file_format != "parquet" - ) + must_rewrite = add_dlt_id or self.item_storage.writer_spec.file_format != "parquet" if not must_rewrite: # in rare cases normalization may be needed - must_rewrite, _, _, _, _ = pyarrow.should_normalize_arrow_schema( + must_rewrite = pyarrow.should_normalize_arrow_schema( arrow_schema, self.schema.get_table_columns(root_table_name), self.schema.naming - ) + )[0] if must_rewrite: logger.info( f"Table {root_table_name} parquet file {extracted_items_file} must be rewritten:" - f" add_dlt_id: {add_dlt_id} add_dlt_load_id: {add_dlt_load_id} destination file" + f" add_dlt_id: {add_dlt_id} destination file" f" format: {self.item_storage.writer_spec.file_format} or due to required" " normalization " ) schema_update = self._write_with_dlt_columns( - extracted_items_file, root_table_name, add_dlt_load_id, add_dlt_id + extracted_items_file, root_table_name, add_dlt_id ) return base_schema_update + schema_update diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md index 426c090f94..f9ceb99a90 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md @@ -84,7 +84,10 @@ add_dlt_load_id = true add_dlt_id = true ``` -Keep in mind that enabling these incurs some performance overhead because the `parquet` file needs to be read back from disk in chunks, processed and rewritten with new columns. +Keep in mind that enabling these incurs some performance overhead: + +- `add_dlt_load_id` has minimal overhead since the column is added to arrow table in memory during `extract` stage, before parquet file is written to disk +- `add_dlt_id` adds the column during `normalize` stage after file has been extracted to disk. 
The file needs to be read back from disk in chunks, processed and rewritten with new columns ## Incremental loading with Arrow tables diff --git a/tests/common/schema/test_inference.py b/tests/common/schema/test_inference.py index da5c809827..0a40953f53 100644 --- a/tests/common/schema/test_inference.py +++ b/tests/common/schema/test_inference.py @@ -565,3 +565,24 @@ def test_infer_on_incomplete_column(schema: Schema) -> None: assert i_column["x-special"] == "spec" # type: ignore[typeddict-item] assert i_column["primary_key"] is True assert i_column["data_type"] == "text" + + +def test_update_table_adds_at_end(schema: Schema) -> None: + row = {"evm": Wei(1)} + _, new_table = schema.coerce_row("eth", None, row) + schema.update_table(new_table) + schema.update_table( + { + "name": new_table["name"], + "columns": { + "_dlt_load_id": { + "name": "_dlt_load_id", + "data_type": "text", + "nullable": False, + } + }, + } + ) + table = schema.tables["eth"] + # place new columns at the end + assert list(table["columns"].keys()) == ["evm", "_dlt_load_id"] diff --git a/tests/pipeline/test_arrow_sources.py b/tests/pipeline/test_arrow_sources.py index 667f26476b..0c03a8209d 100644 --- a/tests/pipeline/test_arrow_sources.py +++ b/tests/pipeline/test_arrow_sources.py @@ -505,3 +505,61 @@ def test_empty_arrow(item_type: TPythonTableFormat) -> None: assert len(pipeline.list_extracted_resources()) == 1 norm_info = pipeline.normalize() assert norm_info.row_counts["items"] == 0 + + +@pytest.mark.parametrize("item_type", ["pandas", "arrow-table", "arrow-batch"]) +def test_extract_adds_dlt_load_id(item_type: TPythonTableFormat) -> None: + os.environ["NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_LOAD_ID"] = "True" + os.environ["DESTINATION__LOADER_FILE_FORMAT"] = "parquet" + + item, _, _ = arrow_table_all_data_types(item_type, num_rows=5432) + + @dlt.resource + def some_data(): + yield item + + pipeline: dlt.Pipeline = dlt.pipeline("arrow_" + uniq_id(), destination="duckdb") + info = pipeline.extract(some_data()) + + load_id = info.loads_ids[0] + jobs = info.load_packages[0].jobs["new_jobs"] + extracted_file = [job for job in jobs if "some_data" in job.file_path][0].file_path + + with pa.parquet.ParquetFile(extracted_file) as pq: + tbl = pq.read() + assert len(tbl) == 5432 + + # Extracted file has _dlt_load_id + assert pa.compute.all(pa.compute.equal(tbl["_dlt_load_id"], load_id)).as_py() + + # Load ID in both schema and arrow tbl should be the last column + assert tbl.schema.names[-1] == "_dlt_load_id" + cols = list(pipeline.default_schema.tables["some_data"]["columns"]) + assert cols[-1] == "_dlt_load_id" + + +def test_extract_json_normalize_parquet_adds_dlt_load_id(): + """Extract jsonl data that gets written to parquet in normalizer. 
Check that _dlt_load_id is added.""" + os.environ["NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_LOAD_ID"] = "True" + + rows, _, _ = arrow_table_all_data_types("object", num_rows=1001) + + @dlt.resource + def some_data(): + yield rows + + pipeline: dlt.Pipeline = dlt.pipeline("arrow_" + uniq_id(), destination="duckdb") + + pipeline.extract(some_data()) + n_info = pipeline.normalize(loader_file_format="parquet") + + load_id = n_info.loads_ids[0] + jobs = n_info.load_packages[0].jobs["new_jobs"] + normalized_file = [job for job in jobs if "some_data" in job.file_path][0].file_path + + with pa.parquet.ParquetFile(normalized_file) as pq: + tbl = pq.read() + assert len(tbl) == 1001 + + # Normalized file has _dlt_load_id + assert pa.compute.all(pa.compute.equal(tbl["_dlt_load_id"], load_id)).as_py() From d4b0bd0dc0e3948dfcb6620ecaab19ed7323da89 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Wed, 19 Jun 2024 07:32:40 +0200 Subject: [PATCH 27/61] set default next item mode to round robin (#1482) * set default next item mode to round robin * fix sources tests * fix some existing tests to work again * parametrize one test --- dlt/extract/pipe_iterator.py | 7 +++---- docs/website/docs/reference/performance.md | 8 +++----- tests/extract/test_decorators.py | 8 ++++++-- tests/extract/test_extract_pipe.py | 11 ++++++++--- tests/extract/test_incremental.py | 8 ++++++++ tests/extract/test_sources.py | 10 +++++++++- 6 files changed, 37 insertions(+), 15 deletions(-) diff --git a/dlt/extract/pipe_iterator.py b/dlt/extract/pipe_iterator.py index 1edd9bd039..3a10f651c0 100644 --- a/dlt/extract/pipe_iterator.py +++ b/dlt/extract/pipe_iterator.py @@ -51,8 +51,7 @@ class PipeIteratorConfiguration(BaseConfiguration): workers: int = 5 futures_poll_interval: float = 0.01 copy_on_fork: bool = False - next_item_mode: str = "fifo" - + next_item_mode: str = "round_robin" __section__: ClassVar[str] = known_sections.EXTRACT def __init__( @@ -82,7 +81,7 @@ def from_pipe( max_parallel_items: int = 20, workers: int = 5, futures_poll_interval: float = 0.01, - next_item_mode: TPipeNextItemMode = "fifo", + next_item_mode: TPipeNextItemMode = "round_robin", ) -> "PipeIterator": # join all dependent pipes if pipe.parent: @@ -109,7 +108,7 @@ def from_pipes( workers: int = 5, futures_poll_interval: float = 0.01, copy_on_fork: bool = False, - next_item_mode: TPipeNextItemMode = "fifo", + next_item_mode: TPipeNextItemMode = "round_robin", ) -> "PipeIterator": # print(f"max_parallel_items: {max_parallel_items} workers: {workers}") sources: List[SourcePipeItem] = [] diff --git a/docs/website/docs/reference/performance.md b/docs/website/docs/reference/performance.md index 407baf9d97..7d8280d8ee 100644 --- a/docs/website/docs/reference/performance.md +++ b/docs/website/docs/reference/performance.md @@ -217,13 +217,11 @@ call `pipeline.activate()` to inject the right context into current thread. ## Resources extraction, `fifo` vs. `round robin` When extracting from resources, you have two options to determine what the order of queries to your -resources are: `fifo` and `round_robin`. +resources are: `round_robin` and `fifo`. -`fifo` is the default option and will result in every resource being fully extracted before the next -resource is extracted in the order that you added them to your source. +`round_robin` is the default option and will result in extraction of one item from the first resource, then one item from the second resource etc, doing as many rounds as necessary until all resources are fully extracted. 
If you want to extract resources in parallel, you will need to keep `round_robin`. -`round_robin` will result in extraction of one item from the first resource, then one item from the -second resource etc, doing as many rounds as necessary until all resources are fully extracted. +`fifo` is an option for sequential extraction. It will result in every resource being fully extracted until the resource generator is expired, or a configured limit is reached, then the next resource will be evaluated. Resources are extracted in the order that you added them to your source. You can change this setting in your `config.toml` as follows: diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 1cf14abe55..db888c95e4 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -812,7 +812,10 @@ def standalone_transformer_returns(item: TDataItem, init: int = dlt.config.value return "A" * item * init -def test_standalone_transformer() -> None: +@pytest.mark.parametrize("next_item_mode", ["fifo", "round_robin"]) +def test_standalone_transformer(next_item_mode: str) -> None: + os.environ["EXTRACT__NEXT_ITEM_MODE"] = next_item_mode + assert not isinstance(standalone_transformer, DltResource) assert callable(standalone_transformer) assert standalone_transformer.__doc__ == """Has fine transformer docstring""" @@ -824,7 +827,8 @@ def test_standalone_transformer() -> None: bound_tx(1) assert isinstance(bound_tx, DltResource) # the resource sets the start of the range of transformer + transformer init - assert list(standalone_signature(1, 3) | bound_tx) == [6, 7, 8, 9, 7, 8, 9] + exp_result = [6, 7, 7, 8, 8, 9, 9] if next_item_mode == "round_robin" else [6, 7, 8, 9, 7, 8, 9] + assert list(standalone_signature(1, 3) | bound_tx) == exp_result # wrong params to transformer with pytest.raises(TypeError): diff --git a/tests/extract/test_extract_pipe.py b/tests/extract/test_extract_pipe.py index 68c1c82124..9bf580b76a 100644 --- a/tests/extract/test_extract_pipe.py +++ b/tests/extract/test_extract_pipe.py @@ -38,7 +38,7 @@ def get_pipes(): Pipe.from_data("data3", source_gen3()), ] - # default mode is "fifo" + # test both modes _l = list(PipeIterator.from_pipes(get_pipes(), next_item_mode="fifo")) # items will be in order of the pipes, nested iterator items appear inline, None triggers a bit of rotation assert [pi.item for pi in _l] == [1, 2, 3, 4, 10, 5, 6, 8, 7, 9, 11, 12, 13, 14, 15] @@ -53,6 +53,11 @@ def get_pipes(): # items will be in order of the pipes, nested iterator items appear inline, None triggers rotation assert [pi.item for pi in _l] == [1, 12, 14, 2, 13, 15, 3, 10, 4, 11, 5, 6, 8, 9, 7] + # default is round robin, should have same result without explicit + _l = list(PipeIterator.from_pipes(get_pipes())) + # items will be in order of the pipes, nested iterator items appear inline, None triggers rotation + assert [pi.item for pi in _l] == [1, 12, 14, 2, 13, 15, 3, 10, 4, 11, 5, 6, 8, 9, 7] + # round robin with max parallel items triggers strict fifo in some cases (after gen2 and 3 are exhausted we already have the first yielded gen, # items appear in order as sources are processed strictly from front) _l = list( @@ -460,7 +465,7 @@ def test_yield_map_step() -> None: p = Pipe.from_data("data", [1, 2, 3]) # this creates number of rows as passed by the data p.append_step(YieldMapItem(lambda item: (yield from [f"item_{x}" for x in range(item)]))) - assert _f_items(list(PipeIterator.from_pipe(p))) == [ + assert _f_items(list(PipeIterator.from_pipe(p, 
next_item_mode="fifo"))) == [ "item_0", "item_0", "item_1", @@ -476,7 +481,7 @@ def test_yield_map_step() -> None: p.append_step( YieldMapItem(lambda item, meta: (yield from [f"item_{meta}_{x}" for x in range(item)])) ) - assert _f_items(list(PipeIterator.from_pipe(p))) == [ + assert _f_items(list(PipeIterator.from_pipe(p, next_item_mode="fifo"))) == [ "item_A_0", "item_B_0", "item_B_1", diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 675f44bb14..bb6fb70983 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -42,6 +42,14 @@ ) +@pytest.fixture(autouse=True) +def switch_to_fifo(): + """most of the following tests rely on the old default fifo next item mode""" + os.environ["EXTRACT__NEXT_ITEM_MODE"] = "fifo" + yield + del os.environ["EXTRACT__NEXT_ITEM_MODE"] + + def test_detect_incremental_arg() -> None: def incr_1(incremental: dlt.sources.incremental): # type: ignore[type-arg] pass diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index e40b03219d..7b2613776d 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -4,7 +4,7 @@ import pytest import asyncio -import dlt +import dlt, os from dlt.common.configuration.container import Container from dlt.common.exceptions import DictValidationException, PipelineStateNotAvailable from dlt.common.pipeline import StateInjectableContext, source_state @@ -31,6 +31,14 @@ from dlt.extract.pipe import Pipe +@pytest.fixture(autouse=True) +def switch_to_fifo(): + """most of the following tests rely on the old default fifo next item mode""" + os.environ["EXTRACT__NEXT_ITEM_MODE"] = "fifo" + yield + del os.environ["EXTRACT__NEXT_ITEM_MODE"] + + def test_call_data_resource() -> None: with pytest.raises(TypeError): DltResource.from_data([1], name="t")() From 37f64a13ad64b289b14fcc7d5e4334806c8d5c88 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Thu, 20 Jun 2024 17:37:12 +0200 Subject: [PATCH 28/61] Fix/1465 fixes snowflake auth credentials (#1489) * correctly handles explicit initial values, still allowing optional args to be resolved * allows pure authenticator, allows to specify token right in the credentials * adds base method to generate typed query params in connection string credentials, serializes to str for to_url * drops resolve() from __init__ and parse_native_value methods * updates snowflake docs * runs native value parsing for side effects --- dlt/common/configuration/resolve.py | 17 ++- .../configuration/specs/base_configuration.py | 57 +++---- .../specs/connection_string_credentials.py | 25 +++- dlt/common/configuration/utils.py | 6 +- .../impl/clickhouse/configuration.py | 28 ++-- dlt/destinations/impl/mssql/configuration.py | 10 +- .../impl/postgres/configuration.py | 12 +- .../impl/snowflake/configuration.py | 77 +++++----- dlt/extract/incremental/__init__.py | 6 +- .../dlt-ecosystem/destinations/snowflake.md | 25 +++- .../configuration/test_configuration.py | 38 +++++ .../common/configuration/test_credentials.py | 41 ++++- .../configuration/test_toml_provider.py | 4 +- tests/load/duckdb/test_motherduck_client.py | 2 +- .../snowflake/test_snowflake_configuration.py | 141 +++++++++++++++--- 15 files changed, 352 insertions(+), 137 deletions(-) diff --git a/dlt/common/configuration/resolve.py b/dlt/common/configuration/resolve.py index 9a4373039b..ee8a1f6029 100644 --- a/dlt/common/configuration/resolve.py +++ b/dlt/common/configuration/resolve.py @@ -126,14 +126,21 @@ def _maybe_parse_native_value( not 
isinstance(explicit_value, C_Mapping) or isinstance(explicit_value, BaseConfiguration) ): try: + # parse the native value anyway because there are configs with side effects config.parse_native_representation(explicit_value) + default_value = config.__class__() + # parse native value and convert it into dict, extract the diff and use it as exact value + # NOTE: as those are the same dataclasses, the set of keys must be the same + explicit_value = { + k: v + for k, v in config.__class__.from_init_value(explicit_value).items() + if default_value[k] != v + } except ValueError as v_err: # provide generic exception raise InvalidNativeValue(type(config), type(explicit_value), embedded_sections, v_err) except NotImplementedError: pass - # explicit value was consumed - explicit_value = None return explicit_value @@ -336,7 +343,11 @@ def _resolve_config_field( # print(f"{embedded_config} IS RESOLVED with VALUE {value}") # injected context will be resolved if value is not None: - _maybe_parse_native_value(embedded_config, value, embedded_sections + (key,)) + from_native_explicit = _maybe_parse_native_value( + embedded_config, value, embedded_sections + (key,) + ) + if from_native_explicit is not value: + embedded_config.update(from_native_explicit) value = embedded_config else: # only config with sections may look for initial values diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 1751b6ae13..2504fdeaef 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -49,8 +49,7 @@ # forward class declaration _F_BaseConfiguration: Any = type(object) _F_ContainerInjectableContext: Any = type(object) -_T = TypeVar("_T", bound="BaseConfiguration") -_C = TypeVar("_C", bound="CredentialsConfiguration") +_B = TypeVar("_B", bound="BaseConfiguration") class NotResolved: @@ -289,6 +288,33 @@ class BaseConfiguration(MutableMapping[str, Any]): """Typing for dataclass fields""" __hint_resolvers__: ClassVar[Dict[str, Callable[["BaseConfiguration"], Type[Any]]]] = {} + @classmethod + def from_init_value(cls: Type[_B], init_value: Any = None) -> _B: + """Initializes credentials from `init_value` + + Init value may be a native representation of the credentials or a dict. In case of native representation (for example a connection string or JSON with service account credentials) + a `parse_native_representation` method will be used to parse it. In case of a dict, the credentials object will be updated with key: values of the dict. + Unexpected values in the dict will be ignored. + + Credentials will be marked as resolved if all required fields are set resolve() method is successful + """ + # create an instance + self = cls() + self._apply_init_value(init_value) + if not self.is_partial(): + # let it fail gracefully + with contextlib.suppress(Exception): + self.resolve() + return self + + def _apply_init_value(self, init_value: Any = None) -> None: + if isinstance(init_value, C_Mapping): + self.update(init_value) + elif init_value is not None: + self.parse_native_representation(init_value) + else: + return + def parse_native_representation(self, native_value: Any) -> None: """Initialize the configuration fields by parsing the `native_value` which should be a native representation of the configuration or credentials, for example database connection string or JSON serialized GCP service credentials file. 
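For reference, a minimal sketch of how the relocated `from_init_value` classmethod behaves after this change. It mirrors the assertions in the credential tests added further down in this patch; the connection strings are illustrative placeholders only:

```py
from dlt.destinations.impl.snowflake.configuration import SnowflakeCredentials

# a complete native representation is parsed and resolved
c = SnowflakeCredentials.from_init_value("snowflake://loader:pass@localhost:5432/dlt_data")
assert c.is_resolved()

# an incomplete native value (no database) is parsed but stays unresolved
c = SnowflakeCredentials.from_init_value("snowflake://loader:pass@localhost")
assert not c.is_resolved()

# a dict is applied via update(); resolve() errors are swallowed, so an invalid
# configuration (here: no password, private key or authenticator) also stays unresolved
c = SnowflakeCredentials.from_init_value(
    {"username": "loader", "host": "localhost", "database": "dlt_data"}
)
assert not c.is_resolved()
```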
@@ -348,7 +374,7 @@ def resolve(self) -> None: self.call_method_in_mro("on_resolved") self.__is_resolved__ = True - def copy(self: _T) -> _T: + def copy(self: _B) -> _B: """Returns a deep copy of the configuration instance""" return copy.deepcopy(self) @@ -426,21 +452,6 @@ class CredentialsConfiguration(BaseConfiguration): __section__: ClassVar[str] = "credentials" - @classmethod - def from_init_value(cls: Type[_C], init_value: Any = None) -> _C: - """Initializes credentials from `init_value` - - Init value may be a native representation of the credentials or a dict. In case of native representation (for example a connection string or JSON with service account credentials) - a `parse_native_representation` method will be used to parse it. In case of a dict, the credentials object will be updated with key: values of the dict. - Unexpected values in the dict will be ignored. - - Credentials will be marked as resolved if all required fields are set. - """ - # create an instance - self = cls() - self._apply_init_value(init_value) - return self - def to_native_credentials(self) -> Any: """Returns native credentials object. @@ -448,16 +459,6 @@ def to_native_credentials(self) -> Any: """ return self.to_native_representation() - def _apply_init_value(self, init_value: Any = None) -> None: - if isinstance(init_value, C_Mapping): - self.update(init_value) - elif init_value is not None: - self.parse_native_representation(init_value) - else: - return - if not self.is_partial(): - self.resolve() - def __str__(self) -> str: """Get string representation of credentials to be displayed, with all secret parts removed""" return super().__str__() diff --git a/dlt/common/configuration/specs/connection_string_credentials.py b/dlt/common/configuration/specs/connection_string_credentials.py index 2691c5d886..5b9a4587c7 100644 --- a/dlt/common/configuration/specs/connection_string_credentials.py +++ b/dlt/common/configuration/specs/connection_string_credentials.py @@ -15,7 +15,7 @@ class ConnectionStringCredentials(CredentialsConfiguration): username: str = None host: Optional[str] = None port: Optional[int] = None - query: Optional[Dict[str, str]] = None + query: Optional[Dict[str, Any]] = None __config_gen_annotations__: ClassVar[List[str]] = ["port", "password", "host"] @@ -44,7 +44,22 @@ def on_resolved(self) -> None: def to_native_representation(self) -> str: return self.to_url().render_as_string(hide_password=False) + def get_query(self) -> Dict[str, Any]: + """Gets query preserving parameter types. Mostly used internally to export connection params""" + return {} if self.query is None else self.query + def to_url(self) -> URL: + """Creates SQLAlchemy compatible URL object, computes current query via `get_query` and serializes its values to str""" + # circular dependencies here + from dlt.common.configuration.utils import serialize_value + + def _serialize_value(v_: Any) -> str: + if v_ is None: + return None + return serialize_value(v_) + + # query must be str -> str + query = {k: _serialize_value(v) for k, v in self.get_query().items()} return URL.create( self.drivername, self.username, @@ -52,8 +67,12 @@ def to_url(self) -> URL: self.host, self.port, self.database, - self.query, + query, ) def __str__(self) -> str: - return self.to_url().render_as_string(hide_password=True) + url = self.to_url() + # do not display query. 
it often contains secret values + url = url._replace(query=None) + # we only have control over netloc/path + return url.render_as_string(hide_password=True) diff --git a/dlt/common/configuration/utils.py b/dlt/common/configuration/utils.py index 6402afcfbe..74190a87de 100644 --- a/dlt/common/configuration/utils.py +++ b/dlt/common/configuration/utils.py @@ -100,7 +100,7 @@ def deserialize_value(key: str, value: Any, hint: Type[TAny]) -> TAny: raise ConfigValueCannotBeCoercedException(key, value, hint) from exc -def serialize_value(value: Any) -> Any: +def serialize_value(value: Any) -> str: if value is None: raise ValueError(value) # return literal for tuples @@ -108,13 +108,13 @@ def serialize_value(value: Any) -> Any: return str(value) if isinstance(value, BaseConfiguration): try: - return value.to_native_representation() + return str(value.to_native_representation()) except NotImplementedError: # no native representation: use dict value = dict(value) # coerce type to text which will use json for mapping and sequences value_dt = py_type_to_sc_type(type(value)) - return coerce_value("text", value_dt, value) + return coerce_value("text", value_dt, value) # type: ignore[no-any-return] def auto_cast(value: str) -> Any: diff --git a/dlt/destinations/impl/clickhouse/configuration.py b/dlt/destinations/impl/clickhouse/configuration.py index bbff6e0a9c..483356f9f9 100644 --- a/dlt/destinations/impl/clickhouse/configuration.py +++ b/dlt/destinations/impl/clickhouse/configuration.py @@ -1,5 +1,5 @@ import dataclasses -from typing import ClassVar, List, Any, Final, Literal, cast, Optional +from typing import ClassVar, Dict, List, Any, Final, Literal, cast, Optional from dlt.common.configuration import configspec from dlt.common.configuration.specs import ConnectionStringCredentials @@ -59,23 +59,21 @@ def parse_native_representation(self, native_value: Any) -> None: self.query.get("send_receive_timeout", self.send_receive_timeout) ) self.secure = cast(TSecureConnection, int(self.query.get("secure", self.secure))) - if not self.is_partial(): - self.resolve() - def to_url(self) -> URL: - url = super().to_url() - url = url.update_query_pairs( - [ - ("connect_timeout", str(self.connect_timeout)), - ("send_receive_timeout", str(self.send_receive_timeout)), - ("secure", str(1) if self.secure else str(0)), + def get_query(self) -> Dict[str, Any]: + query = dict(super().get_query()) + query.update( + { + "connect_timeout": str(self.connect_timeout), + "send_receive_timeout": str(self.send_receive_timeout), + "secure": 1 if self.secure else 0, # Toggle experimental settings. These are necessary for certain datatypes and not optional. - ("allow_experimental_lightweight_delete", "1"), - # ("allow_experimental_object_type", "1"), - ("enable_http_compression", "1"), - ] + "allow_experimental_lightweight_delete": 1, + # "allow_experimental_object_type": 1, + "enable_http_compression": 1, + } ) - return url + return query @configspec diff --git a/dlt/destinations/impl/mssql/configuration.py b/dlt/destinations/impl/mssql/configuration.py index 1d085f40c1..8a50ecc6d2 100644 --- a/dlt/destinations/impl/mssql/configuration.py +++ b/dlt/destinations/impl/mssql/configuration.py @@ -34,8 +34,6 @@ def parse_native_representation(self, native_value: Any) -> None: self.query = {k.lower(): v for k, v in self.query.items()} # Make case-insensitive. 
self.driver = self.query.get("driver", self.driver) self.connect_timeout = int(self.query.get("connect_timeout", self.connect_timeout)) - if not self.is_partial(): - self.resolve() def on_resolved(self) -> None: if self.driver not in self.SUPPORTED_DRIVERS: @@ -45,10 +43,10 @@ def on_resolved(self) -> None: ) self.database = self.database.lower() - def to_url(self) -> URL: - url = super().to_url() - url.update_query_pairs([("connect_timeout", str(self.connect_timeout))]) - return url + def get_query(self) -> Dict[str, Any]: + query = dict(super().get_query()) + query["connect_timeout"] = self.connect_timeout + return query def on_partial(self) -> None: self.driver = self._get_driver() diff --git a/dlt/destinations/impl/postgres/configuration.py b/dlt/destinations/impl/postgres/configuration.py index 0d12abbac7..ae0b5200b2 100644 --- a/dlt/destinations/impl/postgres/configuration.py +++ b/dlt/destinations/impl/postgres/configuration.py @@ -1,5 +1,5 @@ import dataclasses -from typing import Final, ClassVar, Any, List, TYPE_CHECKING, Union +from typing import Dict, Final, ClassVar, Any, List, TYPE_CHECKING, Union from dlt.common.libs.sql_alchemy import URL from dlt.common.configuration import configspec @@ -23,13 +23,11 @@ class PostgresCredentials(ConnectionStringCredentials): def parse_native_representation(self, native_value: Any) -> None: super().parse_native_representation(native_value) self.connect_timeout = int(self.query.get("connect_timeout", self.connect_timeout)) - if not self.is_partial(): - self.resolve() - def to_url(self) -> URL: - url = super().to_url() - url.update_query_pairs([("connect_timeout", str(self.connect_timeout))]) - return url + def get_query(self) -> Dict[str, Any]: + query = dict(super().get_query()) + query["connect_timeout"] = self.connect_timeout + return query @configspec diff --git a/dlt/destinations/impl/snowflake/configuration.py b/dlt/destinations/impl/snowflake/configuration.py index c8cc805712..8529fbe5c8 100644 --- a/dlt/destinations/impl/snowflake/configuration.py +++ b/dlt/destinations/impl/snowflake/configuration.py @@ -13,8 +13,8 @@ from dlt.common.utils import digest128 -def _read_private_key(private_key: str, password: Optional[str] = None) -> bytes: - """Load an encrypted or unencrypted private key from string.""" +def _decode_private_key(private_key: str, password: Optional[str] = None) -> bytes: + """Decode encrypted or unencrypted private key from string.""" try: from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives.asymmetric import rsa @@ -61,67 +61,64 @@ class SnowflakeCredentials(ConnectionStringCredentials): warehouse: Optional[str] = None role: Optional[str] = None authenticator: Optional[str] = None + token: Optional[str] = None private_key: Optional[TSecretStrValue] = None private_key_passphrase: Optional[TSecretStrValue] = None application: Optional[str] = SNOWFLAKE_APPLICATION_ID __config_gen_annotations__: ClassVar[List[str]] = ["password", "warehouse", "role"] + __query_params__: ClassVar[List[str]] = [ + "warehouse", + "role", + "authenticator", + "token", + "private_key", + "private_key_passphrase", + ] def parse_native_representation(self, native_value: Any) -> None: super().parse_native_representation(native_value) - self.warehouse = self.query.get("warehouse") - self.role = self.query.get("role") - self.private_key = self.query.get("private_key") # type: ignore - self.private_key_passphrase = self.query.get("private_key_passphrase") # type: ignore - if not self.is_partial() and 
(self.password or self.private_key): - self.resolve() + for param in self.__query_params__: + if param in self.query: + setattr(self, param, self.query.get(param)) + + # if not self.is_partial() and (self.password or self.private_key): + # self.resolve() def on_resolved(self) -> None: - if not self.password and not self.private_key: + if not self.password and not self.private_key and not self.authenticator: raise ConfigurationValueError( - "Please specify password or private_key. SnowflakeCredentials supports password and" - " private key authentication and one of those must be specified." + "Please specify password or private_key or authenticator fields." + " SnowflakeCredentials supports password, private key and authenticator based (ie." + " oauth2) authentication and one of those must be specified." ) - def to_url(self) -> URL: - query = dict(self.query or {}) - if self.warehouse and "warehouse" not in query: - query["warehouse"] = self.warehouse - if self.role and "role" not in query: - query["role"] = self.role - - if self.application != "" and "application" not in query: - query["application"] = self.application - - return URL.create( - self.drivername, - self.username, - self.password, - self.host, - self.port, - self.database, - query, - ) + def get_query(self) -> Dict[str, Any]: + query = dict(super().get_query() or {}) + for param in self.__query_params__: + if self.get(param, None) is not None: + query[param] = self[param] + return query def to_connector_params(self) -> Dict[str, Any]: - private_key: Optional[bytes] = None + # gather all params in query + query = self.get_query() if self.private_key: - private_key = _read_private_key(self.private_key, self.private_key_passphrase) + query["private_key"] = _decode_private_key( + self.private_key, self.private_key_passphrase + ) - conn_params = dict( - self.query or {}, + # we do not want passphrase to be passed + query.pop("private_key_passphrase", None) + + conn_params: Dict[str, Any] = dict( + query, user=self.username, password=self.password, account=self.host, database=self.database, - warehouse=self.warehouse, - role=self.role, - private_key=private_key, ) - if self.authenticator: - conn_params["authenticator"] = self.authenticator - if self.application != "" and "application" not in conn_params: conn_params["application"] = self.application diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index bcb6b1cc9a..bc25c6fee1 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -269,11 +269,10 @@ def parse_native_representation(self, native_value: Any) -> None: self._primary_key = merged._primary_key self.allow_external_schedulers = merged.allow_external_schedulers self.row_order = merged.row_order + self.__is_resolved__ = self.__is_resolved__ else: # TODO: Maybe check if callable(getattr(native_value, '__lt__', None)) # Passing bare value `incremental=44` gets parsed as initial_value self.initial_value = native_value - if not self.is_partial(): - self.resolve() def get_state(self) -> IncrementalColumnState: """Returns an Incremental state for a particular cursor column""" @@ -491,7 +490,8 @@ def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]: return rows -Incremental.EMPTY = Incremental[Any]("") +Incremental.EMPTY = Incremental[Any]() +Incremental.EMPTY.__is_resolved__ = True class IncrementalResourceWrapper(ItemTransform[TDataItem]): diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md 
b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index deaaff3562..c9d70f65fe 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -71,9 +71,10 @@ You can also decrease the suspend time for your warehouse to 1 minute (**Admin** ### Authentication types Snowflake destination accepts three authentication types: +Snowflake destination accepts three authentication types: - password authentication - [key pair authentication](https://docs.snowflake.com/en/user-guide/key-pair-auth) -- external authentication +- oauth authentication The **password authentication** is not any different from other databases like Postgres or Redshift. `dlt` follows the same syntax as the [SQLAlchemy dialect](https://docs.snowflake.com/en/developer-guide/python-connector/sqlalchemy#required-parameters). @@ -100,16 +101,32 @@ If you pass a passphrase in the connection string, please URL encode it. destination.snowflake.credentials="snowflake://loader:@kgiotue-wn98412/dlt_data?private_key=&private_key_passphrase=" ``` -In **external authentication**, you can use an OAuth provider like Okta or an external browser to authenticate. You pass your authenticator and refresh token as below: +In **oauth authentication**, you can use an OAuth provider like Snowflake, Okta or an external browser to authenticate. In case of Snowflake oauth, you pass your `authenticator` and refresh `token` as below: ```toml [destination.snowflake.credentials] database = "dlt_data" username = "loader" -authenticator="..." +authenticator="oauth" token="..." ``` or in the connection string as query parameters. -Refer to Snowflake [OAuth](https://docs.snowflake.com/en/user-guide/oauth-intro) for more details. + +In case of external authentication, you need to find documentation for your OAuth provider. Refer to Snowflake [OAuth](https://docs.snowflake.com/en/user-guide/oauth-intro) for more details. + +### Additional connection options +We pass all query parameters to `connect` function of Snowflake Python Connector. For example: +```toml +[destination.snowflake.credentials] +database = "dlt_data" +authenticator="oauth" +[destination.snowflake.credentials.query] +timezone="UTC" +# keep session alive beyond 4 hours +client_session_keep_alive=true +``` +Will set the timezone and session keep alive. Mind that if you use `toml` your configuration is typed. The alternative: +`"snowflake://loader/dlt_data?authenticator=oauth&timezone=UTC&client_session_keep_alive=true"` +will pass `client_session_keep_alive` as string to the connect method (which we didn't verify if it works). ## Write disposition All write dispositions are supported. 
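As a small illustration of the typing caveat above, a sketch based on the new `test_query_additional_params` tests in this patch: query parameters taken from a connection string reach `to_connector_params()` as plain strings, while typed values have to come from config (for example the `[destination.snowflake.credentials.query]` table shown above). Host and credentials below are placeholders:

```py
from dlt.destinations.impl.snowflake.configuration import SnowflakeCredentials

creds = SnowflakeCredentials(
    "snowflake://loader:pass@kgiotue-wn98412/dlt_data?timezone=UTC&client_session_keep_alive=true"
)
params = creds.to_connector_params()
assert params["timezone"] == "UTC"
# passed through to snowflake.connector.connect() as a string, not a bool
assert params["client_session_keep_alive"] == "true"
```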
diff --git a/tests/common/configuration/test_configuration.py b/tests/common/configuration/test_configuration.py index e7083956b3..48993971c2 100644 --- a/tests/common/configuration/test_configuration.py +++ b/tests/common/configuration/test_configuration.py @@ -63,6 +63,7 @@ ) from dlt.common.pipeline import TRefreshMode +from dlt.destinations.impl.postgres.configuration import PostgresCredentials from tests.utils import preserve_environ from tests.common.configuration.utils import ( MockProvider, @@ -450,6 +451,43 @@ def test_invalid_native_config_value() -> None: assert py_ex.value.embedded_sections == () +def test_maybe_use_explicit_value() -> None: + # pass through dict and configs + c = ConnectionStringCredentials() + dict_explicit = {"explicit": "is_dict"} + config_explicit = BaseConfiguration() + assert resolve._maybe_parse_native_value(c, dict_explicit, ()) is dict_explicit + assert resolve._maybe_parse_native_value(c, config_explicit, ()) is config_explicit + + # postgres credentials have a default parameter (connect_timeout), which must be removed for explicit value + pg_c = PostgresCredentials() + explicit_value = resolve._maybe_parse_native_value( + pg_c, "postgres://loader@localhost:5432/dlt_data?a=b&c=d", () + ) + # NOTE: connect_timeout and password are not present + assert explicit_value == { + "drivername": "postgres", + "database": "dlt_data", + "username": "loader", + "host": "localhost", + "query": {"a": "b", "c": "d"}, + } + pg_c = PostgresCredentials() + explicit_value = resolve._maybe_parse_native_value( + pg_c, "postgres://loader@localhost:5432/dlt_data?connect_timeout=33", () + ) + assert explicit_value["connect_timeout"] == 33 + + +def test_optional_params_resolved_if_complete_native_value(environment: Any) -> None: + # this native value fully resolves configuration + environment["CREDENTIALS"] = "postgres://loader:pwd@localhost:5432/dlt_data?a=b&c=d" + # still this config value will be injected + environment["CREDENTIALS__CONNECT_TIMEOUT"] = "300" + c = resolve.resolve_configuration(PostgresCredentials()) + assert c.connect_timeout == 300 + + def test_on_resolved(environment: Any) -> None: with pytest.raises(RuntimeError): # head over hells diff --git a/tests/common/configuration/test_credentials.py b/tests/common/configuration/test_credentials.py index 7c184c16e5..d382a95a44 100644 --- a/tests/common/configuration/test_credentials.py +++ b/tests/common/configuration/test_credentials.py @@ -21,7 +21,8 @@ ) from dlt.common.configuration.specs.run_configuration import RunConfiguration -from tests.utils import preserve_environ +from dlt.destinations.impl.snowflake.configuration import SnowflakeCredentials +from tests.utils import TEST_DICT_CONFIG_PROVIDER, preserve_environ from tests.common.utils import json_case_path from tests.common.configuration.utils import environment @@ -64,6 +65,17 @@ """ % OAUTH_USER_INFO +def test_credentials_resolve_from_init_value() -> None: + c = SnowflakeCredentials.from_init_value("snowflake://loader:pass@localhost:5432/dlt_data") + assert c.is_resolved() + # incomplete not resolved + c = SnowflakeCredentials.from_init_value("snowflake://loader:pass@localhost") + assert c.is_resolved() is False + # invalid configuration that raises on resolve() + c = SnowflakeCredentials.from_init_value("snowflake://loader@localhost/dlt_data") + assert c.is_resolved() is False + + def test_connection_string_credentials_native_representation(environment) -> None: with pytest.raises(InvalidConnectionString): 
ConnectionStringCredentials().parse_native_representation(1) @@ -158,10 +170,10 @@ def test_connection_string_resolved_from_native_representation_env(environment: assert c.host == "aws.12.1" -def test_connection_string_from_init() -> None: +def test_connection_string_initializer() -> None: c = ConnectionStringCredentials("postgres://loader:pass@localhost:5432/dlt_data?a=b&c=d") assert c.drivername == "postgres" - assert c.is_resolved() + assert not c.is_resolved() assert not c.is_partial() c = ConnectionStringCredentials( @@ -182,10 +194,31 @@ def test_connection_string_from_init() -> None: assert c.port == 5432 assert c.database == "dlt_data" assert c.query == {"a": "b", "c": "d"} - assert c.is_resolved() + assert not c.is_resolved() assert not c.is_partial() +def test_query_additional_params() -> None: + c = ConnectionStringCredentials("snowflake://user1:pass1@host1/db1?keep_alive=true") + assert c.query["keep_alive"] == "true" + assert c.to_url().query["keep_alive"] == "true" + + # try a typed param + with TEST_DICT_CONFIG_PROVIDER().values({"credentials": {"query": {"keep_alive": True}}}): + c = ConnectionStringCredentials("snowflake://user1:pass1@host1/db1") + assert c.is_resolved() is False + c = resolve_configuration(c) + assert c.query["keep_alive"] is True + assert c.get_query()["keep_alive"] is True + assert c.to_url().query["keep_alive"] == "True" + + +def test_connection_string_str_repr() -> None: + c = ConnectionStringCredentials("postgres://loader:pass@localhost:5432/dlt_data?a=b&c=d") + # password and query string redacted + assert str(c) == "postgres://loader:***@localhost:5432/dlt_data" + + def test_gcp_service_credentials_native_representation(environment) -> None: with pytest.raises(InvalidGoogleNativeCredentialsType): GcpServiceAccountCredentials().parse_native_representation(1) diff --git a/tests/common/configuration/test_toml_provider.py b/tests/common/configuration/test_toml_provider.py index 43bad21ece..ccc73a30c0 100644 --- a/tests/common/configuration/test_toml_provider.py +++ b/tests/common/configuration/test_toml_provider.py @@ -219,8 +219,8 @@ def test_secrets_toml_credentials_from_native_repr( " KEY-----\nMIIEuwIBADANBgkqhkiG9w0BAQEFAASCBKUwggShAgEAAoIBAQCNEN0bL39HmD+S\n...\n-----END" " PRIVATE KEY-----\n" ) - # but project id got overridden from credentials.project_id - assert c.project_id.endswith("-credentials") + # project id taken from the same value, will not be overridden from any other configs + assert c.project_id.endswith("mock-project-id-source.credentials") # also try sql alchemy url (native repr) c2 = resolve.resolve_configuration(ConnectionStringCredentials(), sections=("databricks",)) assert c2.drivername == "databricks+connector" diff --git a/tests/load/duckdb/test_motherduck_client.py b/tests/load/duckdb/test_motherduck_client.py index 15326c89dc..2a1d703c87 100644 --- a/tests/load/duckdb/test_motherduck_client.py +++ b/tests/load/duckdb/test_motherduck_client.py @@ -28,7 +28,7 @@ def test_motherduck_configuration() -> None: assert cred.password == "TOKEN" assert cred.database == "dlt_data" assert cred.is_partial() is False - assert cred.is_resolved() is True + assert cred.is_resolved() is False cred = MotherDuckCredentials() cred.parse_native_representation("md:///?token=TOKEN") diff --git a/tests/load/snowflake/test_snowflake_configuration.py b/tests/load/snowflake/test_snowflake_configuration.py index 610aab7c20..691f0b5a64 100644 --- a/tests/load/snowflake/test_snowflake_configuration.py +++ 
b/tests/load/snowflake/test_snowflake_configuration.py @@ -1,10 +1,14 @@ import os import pytest from pathlib import Path -from dlt.common.libs.sql_alchemy import make_url +from urllib3.util import parse_url + +from dlt.common.configuration.utils import add_config_to_env +from tests.utils import TEST_DICT_CONFIG_PROVIDER pytest.importorskip("snowflake") +from dlt.common.libs.sql_alchemy import make_url from dlt.common.configuration.resolve import resolve_configuration from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.common.utils import digest128 @@ -20,12 +24,20 @@ # mark all tests as essential, do not remove pytestmark = pytest.mark.essential +# PEM key +PKEY_PEM_STR = Path("./tests/common/cases/secrets/encrypted-private-key").read_text("utf8") +# base64 encoded DER key +PKEY_DER_STR = Path("./tests/common/cases/secrets/encrypted-private-key-base64").read_text("utf8") + +PKEY_PASSPHRASE = "12345" + def test_connection_string_with_all_params() -> None: - url = "snowflake://user1:pass1@host1/db1?application=dltHub_dlt&warehouse=warehouse1&role=role1&private_key=cGs%3D&private_key_passphrase=paphr" + url = "snowflake://user1:pass1@host1/db1?warehouse=warehouse1&role=role1&private_key=cGs%3D&private_key_passphrase=paphr&authenticator=oauth&token=TOK" creds = SnowflakeCredentials() creds.parse_native_representation(url) + assert not creds.is_resolved() assert creds.database == "db1" assert creds.username == "user1" @@ -35,6 +47,8 @@ def test_connection_string_with_all_params() -> None: assert creds.role == "role1" assert creds.private_key == "cGs=" assert creds.private_key_passphrase == "paphr" + assert creds.authenticator == "oauth" + assert creds.token == "TOK" expected = make_url(url) to_url_value = str(creds.to_url()) @@ -43,23 +57,103 @@ def test_connection_string_with_all_params() -> None: assert make_url(creds.to_native_representation()) == expected assert to_url_value == str(expected) + +def test_custom_application(): + creds = SnowflakeCredentials() creds.application = "custom" - url = "snowflake://user1:pass1@host1/db1?application=custom&warehouse=warehouse1&role=role1&private_key=cGs%3D&private_key_passphrase=paphr" + url = "snowflake://user1:pass1@host1/db1?authenticator=oauth&warehouse=warehouse1&role=role1&private_key=cGs%3D&private_key_passphrase=paphr&token=TOK" creds.parse_native_representation(url) + assert not creds.is_resolved() expected = make_url(url) to_url_value = str(creds.to_url()) assert make_url(creds.to_native_representation()) == expected assert to_url_value == str(expected) - assert "application=custom" in str(expected) + assert "application=custom" not in str(expected) -def test_to_connector_params() -> None: - # PEM key - pkey_str = Path("./tests/common/cases/secrets/encrypted-private-key").read_text("utf8") +def test_set_all_from_env(environment) -> None: + url = "snowflake://user1:pass1@host1/db1?authenticator=oauth&warehouse=warehouse1&role=role1&private_key=cGs%3D&private_key_passphrase=paphr&token=TOK" + c = SnowflakeCredentials(url) + add_config_to_env(c) + # resolve from environments + creds = resolve_configuration(SnowflakeCredentials()) + assert creds.is_resolved() + assert creds.database == "db1" + assert creds.username == "user1" + assert creds.password == "pass1" + assert creds.host == "host1" + assert creds.warehouse == "warehouse1" + assert creds.role == "role1" + assert creds.private_key == "cGs=" + assert creds.private_key_passphrase == "paphr" + assert creds.authenticator == "oauth" + assert creds.token == "TOK" 
+ +def test_only_authenticator() -> None: + url = "snowflake://user1@host1/db1" + # password, pk or authenticator must be specified + with pytest.raises(ConfigurationValueError): + resolve_configuration(SnowflakeCredentials(url)) + c = resolve_configuration(SnowflakeCredentials("snowflake://user1@host1/db1?authenticator=uri")) + assert c.authenticator == "uri" + assert c.token is None + # token not present + assert c.to_connector_params() == { + "authenticator": "uri", + "user": "user1", + "password": None, + "account": "host1", + "database": "db1", + "application": "dltHub_dlt", + } + c = resolve_configuration( + SnowflakeCredentials("snowflake://user1@host1/db1?authenticator=oauth&token=TOK") + ) + assert c.to_connector_params() == { + "authenticator": "oauth", + "token": "TOK", + "user": "user1", + "password": None, + "account": "host1", + "database": "db1", + "application": "dltHub_dlt", + } + + +def test_no_query(environment) -> None: + c = SnowflakeCredentials("snowflake://user1:pass1@host1/db1") + assert str(c.to_url()) == "snowflake://user1:pass1@host1/db1" + print(c.to_url()) + + +def test_query_additional_params() -> None: + c = SnowflakeCredentials("snowflake://user1:pass1@host1/db1?keep_alive=true") + assert c.to_connector_params()["keep_alive"] == "true" + + # try a typed param + with TEST_DICT_CONFIG_PROVIDER().values({"credentials": {"query": {"keep_alive": True}}}): + c = SnowflakeCredentials("snowflake://user1:pass1@host1/db1") + print(c.__is_resolved__) + assert c.is_resolved() is False + c = resolve_configuration(c) + assert c.to_connector_params()["keep_alive"] is True + # serialize to str + assert c.to_url().query["keep_alive"] == "True" + + +def test_overwrite_query_value_from_explicit() -> None: + # value specified in the query is preserved over the value set in config + c = SnowflakeCredentials("snowflake://user1@host1/db1?authenticator=uri") + c.authenticator = "oauth" + assert c.to_url().query["authenticator"] == "oauth" + assert c.to_connector_params()["authenticator"] == "oauth" + + +def test_to_connector_params_private_key() -> None: creds = SnowflakeCredentials() - creds.private_key = pkey_str # type: ignore[assignment] - creds.private_key_passphrase = "12345" # type: ignore[assignment] + creds.private_key = PKEY_PEM_STR # type: ignore[assignment] + creds.private_key_passphrase = PKEY_PASSPHRASE # type: ignore[assignment] creds.username = "user1" creds.database = "db1" creds.host = "host1" @@ -82,12 +176,9 @@ def test_to_connector_params() -> None: application=SNOWFLAKE_APPLICATION_ID, ) - # base64 encoded DER key - pkey_str = Path("./tests/common/cases/secrets/encrypted-private-key-base64").read_text("utf8") - creds = SnowflakeCredentials() - creds.private_key = pkey_str # type: ignore[assignment] - creds.private_key_passphrase = "12345" # type: ignore[assignment] + creds.private_key = PKEY_DER_STR # type: ignore[assignment] + creds.private_key_passphrase = PKEY_PASSPHRASE # type: ignore[assignment] creds.username = "user1" creds.database = "db1" creds.host = "host1" @@ -127,7 +218,8 @@ def test_snowflake_credentials_native_value(environment) -> None: ) assert c.is_resolved() assert c.password == "pass" - assert "application=dlt" in str(c.to_url()) + assert c.application == "dlt" + assert "application=dlt" not in str(c.to_url()) # # but if password is specified - it is final c = resolve_configuration( SnowflakeCredentials(), @@ -138,14 +230,16 @@ def test_snowflake_credentials_native_value(environment) -> None: # set PK via env del 
os.environ["CREDENTIALS__PASSWORD"] - os.environ["CREDENTIALS__PRIVATE_KEY"] = "pk" + os.environ["CREDENTIALS__PRIVATE_KEY"] = PKEY_DER_STR + os.environ["CREDENTIALS__PRIVATE_KEY_PASSPHRASE"] = PKEY_PASSPHRASE c = resolve_configuration( SnowflakeCredentials(), explicit_value="snowflake://user1@host1/db1?warehouse=warehouse1&role=role1", ) assert c.is_resolved() - assert c.private_key == "pk" - assert "application=dlt" in str(c.to_url()) + assert c.private_key == PKEY_DER_STR + assert c.private_key_passphrase == PKEY_PASSPHRASE + assert c.password is None # check with application = "" it should not be in connection string os.environ["CREDENTIALS__APPLICATION"] = "" @@ -154,7 +248,18 @@ def test_snowflake_credentials_native_value(environment) -> None: explicit_value="snowflake://user1@host1/db1?warehouse=warehouse1&role=role1", ) assert c.is_resolved() + assert c.application == "" assert "application=" not in str(c.to_url()) + conn_params = c.to_connector_params() + assert isinstance(conn_params.pop("private_key"), bytes) + assert conn_params == { + "warehouse": "warehouse1", + "role": "role1", + "user": "user1", + "password": None, + "account": "host1", + "database": "db1", + } def test_snowflake_configuration() -> None: From adde28d3869de060a9e6394ff93eabfd5bfe6c50 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Fri, 21 Jun 2024 21:29:02 +0530 Subject: [PATCH 29/61] Info on how to retrieve secrets using google secret manager (#1505) * Added how to retrieve secrets using google secret manager * Added some minor corrections to snowflake docs * Updated as per comments * Fixing linting error * small correction --------- Co-authored-by: Alena --- .../dlt-ecosystem/destinations/snowflake.md | 4 +- .../docs/walkthroughs/add_credentials.md | 90 +++++++++++++++++++ 2 files changed, 92 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index c9d70f65fe..513c951f78 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -50,14 +50,14 @@ The instructions below assume that you use the default account setup that you ge --create database with standard settings CREATE DATABASE dlt_data; -- create new user - set your password here -CREATE USER loader WITH PASSWORD='' +CREATE USER loader WITH PASSWORD=''; -- we assign all permission to a role CREATE ROLE DLT_LOADER_ROLE; GRANT ROLE DLT_LOADER_ROLE TO USER loader; -- give database access to new role GRANT USAGE ON DATABASE dlt_data TO DLT_LOADER_ROLE; -- allow `dlt` to create new schemas -GRANT CREATE SCHEMA ON DATABASE dlt_data TO ROLE DLT_LOADER_ROLE +GRANT CREATE SCHEMA ON DATABASE dlt_data TO ROLE DLT_LOADER_ROLE; -- allow access to a warehouse named COMPUTE_WH GRANT USAGE ON WAREHOUSE COMPUTE_WH TO DLT_LOADER_ROLE; -- grant access to all future schemas and tables in the database diff --git a/docs/website/docs/walkthroughs/add_credentials.md b/docs/website/docs/walkthroughs/add_credentials.md index 586d1c2a93..5b4f241d56 100644 --- a/docs/website/docs/walkthroughs/add_credentials.md +++ b/docs/website/docs/walkthroughs/add_credentials.md @@ -74,3 +74,93 @@ DESTINATION__BIGQUERY__CREDENTIALS__PRIVATE_KEY DESTINATION__BIGQUERY__CREDENTIALS__CLIENT_EMAIL DESTINATION__BIGQUERY__LOCATION ``` + +## Retrieving credentials from Google Cloud Secret Manager +To retrieve secrets from Google Cloud Secret Manager using Python, and 
convert them into a dictionary format, you'll need to follow these steps. First, ensure that you have the necessary permissions to access the secrets on Google Cloud, and have the `google-cloud-secret-manager` library installed. If not, you can install it using pip: + +```sh +pip install google-cloud-secret-manager +``` + +[Google Cloud Documentation: Secret Manager client libraries.](https://cloud.google.com/secret-manager/docs/reference/libraries) + +Here's how you can retrieve secrets and convert them into a dictionary: + +1. **Set up the Secret Manager client**: Create a client that will interact with the Secret Manager API. +2. **Access the secret**: Use the client to access the secret's latest version. +3. **Convert to a dictionary**: If the secret is stored in a structured format (like JSON), parse it into a Python dictionary. + +Assume we store secrets in JSON format with name "temp-secret": +```json +{"api_token": "ghp_Kskdgf98dugjf98ghd...."} +``` + +Set `.dlt/secrets.toml` as: + +```toml +[google_secrets.credentials] +"project_id" = "" +"private_key" = "-----BEGIN PRIVATE KEY-----\n....\n-----END PRIVATE KEY-----\n" +"client_email" = "....gserviceaccount.com" +``` +or `GOOGLE_SECRETS__CREDENTIALS` to the path of your service account key file. + +Retrieve the secrets stored in the Secret Manager as follows: + +```py +import json as json_lib # Rename the json import to avoid name conflict + +import dlt +from dlt.sources.helpers import requests +from dlt.common.configuration.inject import with_config +from dlt.common.configuration.specs import GcpServiceAccountCredentials +from google.cloud import secretmanager + +@with_config(sections=("google_secrets",)) +def get_secret_dict(secret_id: str, credentials: GcpServiceAccountCredentials = dlt.secrets.value) -> dict: + """ + Retrieve a secret from Google Cloud Secret Manager and convert it to a dictionary. + """ + # Create the Secret Manager client with provided credentials + client = secretmanager.SecretManagerServiceClient(credentials=credentials.to_native_credentials()) + + # Build the resource name of the secret version + name = f"projects/{credentials.project_id}/secrets/{secret_id}/versions/latest" + + # Access the secret version + response = client.access_secret_version(request={"name": name}) + + # Decode the payload to a string and convert it to a dictionary + secret_string = response.payload.data.decode("UTF-8") + secret_dict = json_lib.loads(secret_string) + + return secret_dict + +# Retrieve secret data as a dictionary for use in other functions. +secret_data = get_secret_dict("temp-secret") + +# Set up the request URL and headers +url = "https://api.github.com/orgs/dlt-hub/repos" +headers = { + "Authorization": f"token {secret_data['api_token']}", # Use the API token from the secret data + "Accept": "application/vnd.github+json", # Set the Accept header for GitHub API +} + +# Make a request to the GitHub API to get the list of repositories +response = requests.get(url, headers=headers) + +# Set up the DLT pipeline +pipeline = dlt.pipeline( + pipeline_name="quick_start", destination="duckdb", dataset_name="mydata" +) +# Run the pipeline with the data from the GitHub API response +load_info = pipeline.run(response.json()) +# Print the load information to check the results +print(load_info) +``` + +### Points to Note: + +- **Permissions**: Ensure the service account or user credentials you are using have the necessary permissions to access the Secret Manager and the specific secrets. 
+- **Secret format**: This example assumes that the secret is stored in a JSON string format. If your secret is in a different format, you will need to adjust the parsing method accordingly. +- **Google Cloud authentication**: Make sure your environment is authenticated with Google Cloud. This can typically be done by setting credentials in `.dlt/secrets.toml` or setting the `GOOGLE_SECRETS__CREDENTIALS` environment variable to the path of your service account key file or the dict of credentials as a string. \ No newline at end of file From e2f6ada3fb0bc9ee98faa89b6bcc7dcc71bff22f Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Mon, 24 Jun 2024 11:17:07 +0200 Subject: [PATCH 30/61] Fixes incorrect type check in the snippet (#1424) --- .../website/docs/dlt-ecosystem/verified-sources/sql_database.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md index de3e5f4c35..fde7a64144 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md @@ -179,7 +179,7 @@ pipeline = dlt.pipeline( def _double_as_decimal_adapter(table: sa.Table) -> None: """Return double as double, not decimals, this is mysql thing""" for column in table.columns.values(): - if isinstance(column.type, sa.Double): # type: ignore + if isinstance(column.type, sa.Float): column.type.asdecimal = False sql_alchemy_source = sql_database( From 934829bb1cc2aa31da8122bdf64bcee98c6063b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Willi=20M=C3=BCller?= Date: Mon, 24 Jun 2024 14:48:40 +0530 Subject: [PATCH 31/61] #1356 implements OAuth2 Two-legged flow (#1357) --- dlt/sources/helpers/rest_client/auth.py | 86 +++++++++- .../verified-sources/rest_api.md | 1 + .../docs/general-usage/http/rest-client.md | 54 ++++++ tests/sources/helpers/rest_client/conftest.py | 41 ++++- .../helpers/rest_client/test_client.py | 158 +++++++++++++++++- 5 files changed, 320 insertions(+), 20 deletions(-) diff --git a/dlt/sources/helpers/rest_client/auth.py b/dlt/sources/helpers/rest_client/auth.py index 29e6d8c77a..d2ca1c1ca6 100644 --- a/dlt/sources/helpers/rest_client/auth.py +++ b/dlt/sources/helpers/rest_client/auth.py @@ -1,17 +1,18 @@ -from base64 import b64encode -import dataclasses import math +import dataclasses +from abc import abstractmethod +from base64 import b64encode from typing import ( - List, + TYPE_CHECKING, + Any, Dict, Final, + Iterable, + List, Literal, Optional, Union, - Any, cast, - Iterable, - TYPE_CHECKING, ) from typing_extensions import Annotated from requests.auth import AuthBase @@ -24,7 +25,6 @@ from dlt.common.configuration.specs.exceptions import NativeValueError from dlt.common.pendulum import pendulum from dlt.common.typing import TSecretStrValue - from dlt.sources.helpers import requests if TYPE_CHECKING: @@ -144,6 +144,76 @@ def __call__(self, request: PreparedRequest) -> PreparedRequest: return request +@configspec +class OAuth2ClientCredentials(OAuth2AuthBase): + """ + This class implements OAuth2 Client Credentials flow where the autorization service + gives permission without the end user approving. + This is often used for machine-to-machine authorization. + The client sends its client ID and client secret to the authorization service which replies + with a temporary access token. + With the access token, the client can access resource services. 
+ """ + + def __init__( + self, + access_token_url: TSecretStrValue, + client_id: TSecretStrValue, + client_secret: TSecretStrValue, + access_token_request_data: Dict[str, Any] = None, + default_token_expiration: int = 3600, + session: Annotated[BaseSession, NotResolved()] = None, + ) -> None: + super().__init__() + self.access_token_url = access_token_url + self.client_id = client_id + self.client_secret = client_secret + if access_token_request_data is None: + self.access_token_request_data = {} + else: + self.access_token_request_data = access_token_request_data + self.default_token_expiration = default_token_expiration + self.token_expiry: pendulum.DateTime = pendulum.now() + + self.session = session if session is not None else requests.client.session + + def __call__(self, request: PreparedRequest) -> PreparedRequest: + if self.access_token is None or self.is_token_expired(): + self.obtain_token() + request.headers["Authorization"] = f"Bearer {self.access_token}" + return request + + def is_token_expired(self) -> bool: + return pendulum.now() >= self.token_expiry + + def obtain_token(self) -> None: + response = self.session.post(self.access_token_url, **self.build_access_token_request()) + response.raise_for_status() + response_json = response.json() + self.parse_native_representation(self.parse_access_token(response_json)) + expires_in_seconds = self.parse_expiration_in_seconds(response_json) + self.token_expiry = pendulum.now().add(seconds=expires_in_seconds) + + def build_access_token_request(self) -> Dict[str, Any]: + return { + "headers": { + "Content-Type": "application/x-www-form-urlencoded", + }, + "data": { + "client_id": self.client_id, + "client_secret": self.client_secret, + "grant_type": "client_credentials", + **self.access_token_request_data, + }, + } + + def parse_expiration_in_seconds(self, response_json: Any) -> int: + return int(response_json.get("expires_in", self.default_token_expiration)) + + def parse_access_token(self, response_json: Any) -> str: + return str(response_json.get("access_token")) + + @configspec class OAuthJWTAuth(BearerTokenAuth): """This is a form of Bearer auth, actually there's not standard way to declare it in openAPI""" @@ -164,7 +234,7 @@ def __post_init__(self) -> None: self.scopes = self.scopes if isinstance(self.scopes, str) else " ".join(self.scopes) self.token = None self.token_expiry: Optional[pendulum.DateTime] = None - # use default system session is not specified + # use default system session unless specified otherwise if self.session is None: self.session = requests.client.session diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 98725627b9..11d09c89f7 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -416,6 +416,7 @@ Available authentication types: | [BearTokenAuth](../../general-usage/http/rest-client.md#bearer-token-authentication) | `bearer` | Bearer token authentication. | | [HTTPBasicAuth](../../general-usage/http/rest-client.md#http-basic-authentication) | `http_basic` | Basic HTTP authentication. | | [APIKeyAuth](../../general-usage/http/rest-client.md#api-key-authentication) | `api_key` | API key authentication with key defined in the query parameters or in the headers. 
| +| [OAuth2ClientCredentials](../../general-usage/http/rest-client.md#oauth20-authorization) | N/A | OAuth 2.0 authorization with a temporary access token obtained from the authorization server. | To specify the authentication configuration, use the `auth` field in the [client](#client) configuration: diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index 1093428b0f..3a7276a534 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -406,8 +406,11 @@ The available authentication methods are defined in the `dlt.sources.helpers.res - [BearerTokenAuth](#bearer-token-authentication) - [APIKeyAuth](#api-key-authentication) - [HttpBasicAuth](#http-basic-authentication) +- [OAuth2ClientCredentials](#oauth20-authorization) For specific use cases, you can [implement custom authentication](#implementing-custom-authentication) by subclassing the `AuthBase` class from the Requests library. +For specific flavors of OAuth 2.0 you can [implement custom OAuth 2.0](#oauth2-authorization) +by subclassing `OAuth2ClientCredentials`. ### Bearer token authentication @@ -477,6 +480,57 @@ client = RESTClient(base_url="https://api.example.com", auth=auth) response = client.get("/protected/resource") ``` +### OAuth 2.0 authorization + +OAuth 2.0 is a common protocol for authorization. We have implemented two-legged authorization employed for server-to-server authorization because the end user (resource owner) does not need to grant approval. +The REST client acts as the OAuth client which obtains a temporary access token from the authorization server. This access token is then sent to the resource server to access protected content. If the access token is expired, the OAuth client automatically refreshes it. + +Unfortunately, most OAuth 2.0 implementations vary and thus you might need to subclass `OAuth2ClientCredentials` and implement `build_access_token_request()` to suite the requirements of the specific authorization server you want to interact with. + +**Parameters:** +- `access_token_url`: The url to obtain the temporary access token. +- `client_id`: Client credential to obtain authorization. Usually issued via a developer portal. +- `client_secret`: Client credential to obtain authorization. Usually issued via a developer portal. +- `access_token_request_data`: A dictionary with data required by the autorization server apart from the `client_id`, `client_secret`, and `"grant_type": "client_credentials"`. Defaults to `None`. +- `default_token_expiration`: The time in seconds after which the temporary access token expires. Defaults to 3600. + +**Example:** + +```py +from base64 import b64encode +from dlt.sources.helpers.rest_client import RESTClient +from dlt.sources.helpers.rest_client.auth import OAuth2ClientCredentials + +class OAuth2ClientCredentialsHTTPBasic(OAuth2ClientCredentials): + """Used e.g. 
by Zoom Zoom Video Communications, Inc.""" + def build_access_token_request(self) -> Dict[str, Any]: + authentication: str = b64encode( + f"{self.client_id}:{self.client_secret}".encode() + ).decode() + return { + "headers": { + "Authorization": f"Basic {authentication}", + "Content-Type": "application/x-www-form-urlencoded", + }, + "data": self.access_token_request_data, + } + +auth = OAuth2ClientCredentialsHTTPBasic( + access_token_url=dlt.secrets["sources.zoom.access_token_url"], # "https://zoom.us/oauth/token" + client_id=dlt.secrets["sources.zoom.client_id"], + client_secret=dlt.secrets["sources.zoom.client_secret"], + access_token_request_data={ + "grant_type": "account_credentials", + "account_id": dlt.secrets["sources.zoom.account_id"], + }, +) +client = RESTClient(base_url="https://api.zoom.us/v2", auth=auth) + +response = client.get("/users") +``` + + + ### Implementing custom authentication You can implement custom authentication by subclassing the `AuthBase` class and implementing the `__call__` method: diff --git a/tests/sources/helpers/rest_client/conftest.py b/tests/sources/helpers/rest_client/conftest.py index 7453c63d14..08233bc3a8 100644 --- a/tests/sources/helpers/rest_client/conftest.py +++ b/tests/sources/helpers/rest_client/conftest.py @@ -1,8 +1,8 @@ import re -from typing import NamedTuple, Callable, Pattern, List, Union, TYPE_CHECKING, Dict, List, Any +from typing import NamedTuple, Callable, Pattern, Union, TYPE_CHECKING, Dict, List, Any import base64 -from urllib.parse import urlsplit, urlunsplit +from urllib.parse import parse_qs, urlsplit, urlunsplit import pytest import requests_mock @@ -207,7 +207,17 @@ def protected_api_key(request, context): @router.post("/oauth/token") def oauth_token(request, context): - return {"access_token": "test-token", "expires_in": 3600} + if oauth_authorize(request): + return {"access_token": "test-token", "expires_in": 3600} + context.status_code = 401 + return {"error": "Unauthorized"} + + @router.post("/oauth/token-expires-now") + def oauth_token_expires_now(request, context): + if oauth_authorize(request): + return {"access_token": "test-token", "expires_in": 0} + context.status_code = 401 + return {"error": "Unauthorized"} @router.post("/auth/refresh") def refresh_token(request, context): @@ -217,11 +227,36 @@ def refresh_token(request, context): context.status_code = 401 return {"error": "Invalid refresh token"} + @router.post("/custom-oauth/token") + def custom_oauth_token(request, context): + qs = parse_qs(request.text) + if ( + qs.get("grant_type")[0] == "account_credentials" + and qs.get("account_id")[0] == "test-account-id" + and request.headers["Authorization"] + == "Basic dGVzdC1hY2NvdW50LWlkOnRlc3QtY2xpZW50LXNlY3JldA==" + ): + return {"access_token": "test-token", "expires_in": 3600} + context.status_code = 401 + return {"error": "Unauthorized"} + router.register_routes(m) yield m +def oauth_authorize(request): + qs = parse_qs(request.text) + grant_type = qs.get("grant_type")[0] + if "jwt-bearer" in grant_type: + return True + if "client_credentials" in grant_type: + return ( + qs["client_secret"][0] == "test-client-secret" + and qs["client_id"][0] == "test-client-id" + ) + + def assert_pagination(pages, expected_start=0, page_size=10, total_pages=10): assert len(pages) == total_pages for i, page in enumerate(pages): diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index bd65affe62..7196ef3436 100644 --- a/tests/sources/helpers/rest_client/test_client.py 
+++ b/tests/sources/helpers/rest_client/test_client.py @@ -1,23 +1,28 @@ import os +from base64 import b64encode +from typing import Any, Dict, cast +from unittest.mock import patch + import pytest -from typing import Any, cast -from dlt.common import logger from requests import PreparedRequest, Request, Response from requests.auth import AuthBase +from requests.exceptions import HTTPError + +from dlt.common import logger from dlt.common.typing import TSecretStrValue from dlt.sources.helpers.requests import Client from dlt.sources.helpers.rest_client import RESTClient -from dlt.sources.helpers.rest_client.client import Hooks -from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator - -from dlt.sources.helpers.rest_client.auth import AuthConfigBase from dlt.sources.helpers.rest_client.auth import ( - BearerTokenAuth, APIKeyAuth, + AuthConfigBase, + BearerTokenAuth, HttpBasicAuth, + OAuth2ClientCredentials, OAuthJWTAuth, ) +from dlt.sources.helpers.rest_client.client import Hooks from dlt.sources.helpers.rest_client.exceptions import IgnoreResponseException +from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator from .conftest import assert_pagination @@ -31,13 +36,40 @@ def load_private_key(name="private_key.pem"): TEST_PRIVATE_KEY = load_private_key() -@pytest.fixture -def rest_client() -> RESTClient: +def build_rest_client(auth=None) -> RESTClient: return RESTClient( base_url="https://api.example.com", headers={"Accept": "application/json"}, session=Client().session, + auth=auth, + ) + + +@pytest.fixture +def rest_client() -> RESTClient: + return build_rest_client() + + +@pytest.fixture +def rest_client_oauth() -> RESTClient: + auth = OAuth2ClientCredentials( + access_token_url=cast(TSecretStrValue, "https://api.example.com/oauth/token"), + client_id=cast(TSecretStrValue, "test-client-id"), + client_secret=cast(TSecretStrValue, "test-client-secret"), + session=Client().session, ) + return build_rest_client(auth=auth) + + +@pytest.fixture +def rest_client_immediate_oauth_expiry(auth=None) -> RESTClient: + credentials_expiring_now = OAuth2ClientCredentials( + access_token_url=cast(TSecretStrValue, "https://api.example.com/oauth/token-expires-now"), + client_id=cast(TSecretStrValue, "test-client-id"), + client_secret=cast(TSecretStrValue, "test-client-secret"), + session=Client().session, + ) + return build_rest_client(auth=credentials_expiring_now) @pytest.mark.usefixtures("mock_api_server") @@ -163,6 +195,114 @@ def test_api_key_auth_success(self, rest_client: RESTClient): assert response.status_code == 200 assert response.json()["data"][0] == {"id": 0, "title": "Post 0"} + def test_oauth2_client_credentials_flow_auth_success(self, rest_client_oauth: RESTClient): + response = rest_client_oauth.get("/protected/posts/bearer-token") + + assert response.status_code == 200 + assert "test-token" in response.request.headers["Authorization"] + + pages_iter = rest_client_oauth.paginate("/protected/posts/bearer-token") + + assert_pagination(list(pages_iter)) + + def test_oauth2_client_credentials_flow_wrong_client_id(self, rest_client: RESTClient): + auth = OAuth2ClientCredentials( + access_token_url=cast(TSecretStrValue, "https://api.example.com/oauth/token"), + client_id=cast(TSecretStrValue, "invalid-client-id"), + client_secret=cast(TSecretStrValue, "test-client-secret"), + session=Client().session, + ) + + with pytest.raises(HTTPError) as e: + rest_client.get("/protected/posts/bearer-token", auth=auth) + assert e.type == HTTPError + assert e.match("401 
Client Error") + + def test_oauth2_client_credentials_flow_wrong_client_secret(self, rest_client: RESTClient): + auth = OAuth2ClientCredentials( + access_token_url=cast(TSecretStrValue, "https://api.example.com/oauth/token"), + client_id=cast(TSecretStrValue, "test-client-id"), + client_secret=cast(TSecretStrValue, "invalid-client-secret"), + session=Client().session, + ) + + with pytest.raises(HTTPError) as e: + rest_client.get( + "/protected/posts/bearer-token", + auth=auth, + ) + assert e.type == HTTPError + assert e.match("401 Client Error") + + + def test_oauth_token_expired_refresh(self, rest_client_immediate_oauth_expiry: RESTClient): + rest_client = rest_client_immediate_oauth_expiry + auth = cast(OAuth2ClientCredentials, rest_client.auth) + + with patch.object(auth, "obtain_token", wraps=auth.obtain_token) as mock_obtain_token: + assert auth.access_token is None + response = rest_client.get("/protected/posts/bearer-token") + mock_obtain_token.assert_called_once() + assert response.status_code == 200 + assert auth.access_token is not None + expiry_0 = auth.token_expiry + auth.token_expiry = auth.token_expiry.subtract(seconds=1) + expiry_1 = auth.token_expiry + assert expiry_0 > expiry_1 + assert auth.is_token_expired() + + response = rest_client.get("/protected/posts/bearer-token") + assert mock_obtain_token.call_count == 2 + assert response.status_code == 200 + expiry_2 = auth.token_expiry + assert expiry_2 > expiry_1 + assert response.json()["data"][0] == {"id": 0, "title": "Post 0"} + + def test_oauth_customized_token_request(self, rest_client: RESTClient): + class OAuth2ClientCredentialsHTTPBasic(OAuth2ClientCredentials): + """OAuth 2.0 as required by e.g. Zoom Video Communications, Inc.""" + + def build_access_token_request(self) -> Dict[str, Any]: + authentication: str = b64encode( + f"{self.client_id}:{self.client_secret}".encode() + ).decode() + return { + "headers": { + "Authorization": f"Basic {authentication}", + "Content-Type": "application/x-www-form-urlencoded", + }, + "data": { + "grant_type": "account_credentials", + **self.access_token_request_data, + }, + } + + auth = OAuth2ClientCredentialsHTTPBasic( + access_token_url=cast(TSecretStrValue, "https://api.example.com/custom-oauth/token"), + client_id=cast(TSecretStrValue, "test-account-id"), + client_secret=cast(TSecretStrValue, "test-client-secret"), + access_token_request_data={ + "account_id": cast(TSecretStrValue, "test-account-id"), + }, + session=Client().session, + ) + + assert auth.build_access_token_request() == { + "headers": { + "Authorization": "Basic dGVzdC1hY2NvdW50LWlkOnRlc3QtY2xpZW50LXNlY3JldA==", + "Content-Type": "application/x-www-form-urlencoded", + }, + "data": { + "grant_type": "account_credentials", + "account_id": "test-account-id", + }, + } + + rest_client.auth = auth + pages_iter = rest_client.paginate("/protected/posts/bearer-token") + + assert_pagination(list(pages_iter)) + def test_oauth_jwt_auth_success(self, rest_client: RESTClient): auth = OAuthJWTAuth( client_id="test-client-id", From 6466ce4060d8dc70c03c76812ad806426194fb9e Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Mon, 24 Jun 2024 11:31:33 +0200 Subject: [PATCH 32/61] Shorten the installation section in README (#1500) --- README.md | 10 +--------- docs/website/docs/reference/installation.md | 2 +- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index ed1cc751c2..bc0f40b62f 100644 --- a/README.md +++ b/README.md @@ -30,20 +30,12 @@ Be it a Google Colab notebook, AWS Lambda function, 
an Airflow DAG, your local l dlt supports Python 3.8+. -**pip:** ```sh pip install dlt ``` -**pixi:** -```sh -pixi add dlt -``` +More options: [Install via Conda or Pixi](https://dlthub.com/docs/reference/installation#install-dlt-via-pixi-and-conda) -**conda:** -```sh -conda install -c conda-forge dlt -``` ## Quick Start diff --git a/docs/website/docs/reference/installation.md b/docs/website/docs/reference/installation.md index 3f40c3a545..a23ce82c97 100644 --- a/docs/website/docs/reference/installation.md +++ b/docs/website/docs/reference/installation.md @@ -110,7 +110,7 @@ You can install `dlt` in your virtual environment by running: pip install -U dlt ``` -## Install dlt via pixi and conda +## Install dlt via Pixi and Conda Install dlt using `pixi`: From 6b83ceec9dbac809f5e154c080c6d88b2218b381 Mon Sep 17 00:00:00 2001 From: Marcel Coetzee <34739235+Pipboyguy@users.noreply.github.com> Date: Mon, 24 Jun 2024 15:38:35 +0200 Subject: [PATCH 33/61] Add LanceDB custom destination example code (#1323) * Add LanceDB custom destination example code Signed-off-by: Marcel Coetzee * Format Signed-off-by: Marcel Coetzee * Remove Postgres credentials from example.secrets.toml Signed-off-by: Marcel Coetzee * Format Signed-off-by: Marcel Coetzee * Add typing Signed-off-by: Marcel Coetzee * Refactor code documentation and add type ignore comments Signed-off-by: Marcel Coetzee * Ignore checks Signed-off-by: Marcel Coetzee * wrap in main if statement Signed-off-by: Marcel Coetzee * Add lancedb to install dependencies in test_doc_snippets workflow Signed-off-by: Marcel Coetzee * poetry Signed-off-by: Marcel Coetzee * Update deps Signed-off-by: Marcel Coetzee * Update LanceDB version and replace Sentence-Transformers with OpenAIEmbeddings Signed-off-by: Marcel Coetzee * Poetry lock Signed-off-by: Marcel Coetzee * Format Signed-off-by: Marcel Coetzee * Update versions Signed-off-by: Marcel Coetzee * Replace OpenAI with Cohere in LanceDB custom destination example Signed-off-by: Marcel Coetzee * Format Signed-off-by: Marcel Coetzee * Add error handling to custom destination lanceDB example Signed-off-by: Marcel Coetzee * Lift config to secrets/config Signed-off-by: Marcel Coetzee * Ignore example lancedb local dir Signed-off-by: Marcel Coetzee * Why was this uncommented Signed-off-by: Marcel Coetzee * Remove unnecessary lock Signed-off-by: Marcel Coetzee * Cleanup Signed-off-by: Marcel Coetzee * Remove print statements from custom_destination_lancedb.py Signed-off-by: Marcel Coetzee * Print info Signed-off-by: Marcel Coetzee * Print info Signed-off-by: Marcel Coetzee * Use rest_client Signed-off-by: Marcel Coetzee * noqa Signed-off-by: Marcel Coetzee * Remove `cohere` dependency and add `embeddings` extra to `lancedb` Signed-off-by: Marcel Coetzee * changing secrets path for cohere to pass docs tests * fixes lock file * moves get lancedb path to run within the test * fix dependencies * fix linting * fix lancedb deps * update lock file * change source name * moved client_id to secrets * switch lancedb example to openai and small fixes * small fixes * add openai to docs deps * fix grammar gpt typing --------- Signed-off-by: Marcel Coetzee Co-authored-by: Marcin Rudolf Co-authored-by: rahuljo Co-authored-by: Dave Co-authored-by: Alena --- .../.dlt/config.toml | 2 + .../.dlt/example.secrets.toml | 7 + .../custom_destination_lancedb/.gitignore | 1 + .../custom_destination_lancedb/__init__.py | 0 .../custom_destination_lancedb.py | 155 ++++++++++++++++++ docs/tools/fix_grammar_gpt.py | 2 +- poetry.lock | 152 
++++++++++++++++- pyproject.toml | 2 + 8 files changed, 319 insertions(+), 2 deletions(-) create mode 100644 docs/examples/custom_destination_lancedb/.dlt/config.toml create mode 100644 docs/examples/custom_destination_lancedb/.dlt/example.secrets.toml create mode 100644 docs/examples/custom_destination_lancedb/.gitignore create mode 100644 docs/examples/custom_destination_lancedb/__init__.py create mode 100644 docs/examples/custom_destination_lancedb/custom_destination_lancedb.py diff --git a/docs/examples/custom_destination_lancedb/.dlt/config.toml b/docs/examples/custom_destination_lancedb/.dlt/config.toml new file mode 100644 index 0000000000..4fd35e1159 --- /dev/null +++ b/docs/examples/custom_destination_lancedb/.dlt/config.toml @@ -0,0 +1,2 @@ +[lancedb] +db_path = "spotify.db" \ No newline at end of file diff --git a/docs/examples/custom_destination_lancedb/.dlt/example.secrets.toml b/docs/examples/custom_destination_lancedb/.dlt/example.secrets.toml new file mode 100644 index 0000000000..9c86df320c --- /dev/null +++ b/docs/examples/custom_destination_lancedb/.dlt/example.secrets.toml @@ -0,0 +1,7 @@ +[spotify] +client_id = "" +client_secret = "" + +# provide the openai api key here +[destination.lancedb.credentials] +embedding_model_provider_api_key = "" \ No newline at end of file diff --git a/docs/examples/custom_destination_lancedb/.gitignore b/docs/examples/custom_destination_lancedb/.gitignore new file mode 100644 index 0000000000..c73564481b --- /dev/null +++ b/docs/examples/custom_destination_lancedb/.gitignore @@ -0,0 +1 @@ +spotify.db \ No newline at end of file diff --git a/docs/examples/custom_destination_lancedb/__init__.py b/docs/examples/custom_destination_lancedb/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/custom_destination_lancedb/custom_destination_lancedb.py b/docs/examples/custom_destination_lancedb/custom_destination_lancedb.py new file mode 100644 index 0000000000..9d75d90f99 --- /dev/null +++ b/docs/examples/custom_destination_lancedb/custom_destination_lancedb.py @@ -0,0 +1,155 @@ +""" +--- +title: Custom Destination with LanceDB +description: Learn how use the custom destination to load to LanceDB. +keywords: [destination, credentials, example, lancedb, custom destination, vectorstore, AI, LLM] +--- + +This example showcases a Python script that demonstrates the integration of LanceDB, an open-source vector database, +as a custom destination within the dlt ecosystem. +The script illustrates the implementation of a custom destination as well as the population of the LanceDB vector +store with data from various sources. +This highlights the seamless interoperability between dlt and LanceDB. + +You can get a Spotify client ID and secret from https://developer.spotify.com/. 
+ +We'll learn how to: +- Use the [custom destination](../dlt-ecosystem/destinations/destination.md) +- Delegate the embeddings to LanceDB using OpenAI Embeddings +""" + +__source_name__ = "spotify" + +import datetime # noqa: I251 +import os +from dataclasses import dataclass, fields +from pathlib import Path +from typing import Any + +import lancedb # type: ignore +from lancedb.embeddings import get_registry # type: ignore +from lancedb.pydantic import LanceModel, Vector # type: ignore + +import dlt +from dlt.common.configuration import configspec +from dlt.common.schema import TTableSchema +from dlt.common.typing import TDataItems, TSecretStrValue +from dlt.sources.helpers import requests +from dlt.sources.helpers.rest_client import RESTClient, AuthConfigBase + +# access secrets to get openai key and instantiate embedding function +openai_api_key: str = dlt.secrets.get("destination.lancedb.credentials.embedding_model_provider_api_key") +func = get_registry().get("openai").create(name="text-embedding-3-small", api_key=openai_api_key) + + +class EpisodeSchema(LanceModel): + id: str # noqa: A003 + name: str + description: str = func.SourceField() + vector: Vector(func.ndims()) = func.VectorField() # type: ignore[valid-type] + release_date: datetime.date + href: str + + +@dataclass(frozen=True) +class Shows: + monday_morning_data_chat: str = "3Km3lBNzJpc1nOTJUtbtMh" + latest_space_podcast: str = "2p7zZVwVF6Yk0Zsb4QmT7t" + superdatascience_podcast: str = "1n8P7ZSgfVLVJ3GegxPat1" + lex_fridman: str = "2MAi0BvDc6GTFvKFPXnkCL" + + +@configspec +class SpotifyAuth(AuthConfigBase): + client_id: str = None + client_secret: TSecretStrValue = None + + def __call__(self, request) -> Any: + if not hasattr(self, "access_token"): + self.access_token = self._get_access_token() + request.headers["Authorization"] = f"Bearer {self.access_token}" + return request + + def _get_access_token(self) -> Any: + auth_url = "https://accounts.spotify.com/api/token" + auth_response = requests.post( + auth_url, + { + "grant_type": "client_credentials", + "client_id": self.client_id, + "client_secret": self.client_secret, + }, + ) + return auth_response.json()["access_token"] + + +@dlt.source +def spotify_shows( + client_id: str = dlt.secrets.value, + client_secret: str = dlt.secrets.value, +): + spotify_base_api_url = "https://api.spotify.com/v1" + client = RESTClient( + base_url=spotify_base_api_url, + auth=SpotifyAuth(client_id=client_id, client_secret=client_secret), # type: ignore[arg-type] + ) + + for show in fields(Shows): + show_name = show.name + show_id = show.default + url = f"/shows/{show_id}/episodes" + yield dlt.resource( + client.paginate(url, params={"limit": 50}), + name=show_name, + write_disposition="merge", + primary_key="id", + parallelized=True, + max_table_nesting=0, + ) + + +@dlt.destination(batch_size=250, name="lancedb") +def lancedb_destination(items: TDataItems, table: TTableSchema) -> None: + db_path = Path(dlt.config.get("lancedb.db_path")) + db = lancedb.connect(db_path) + + # since we are embedding the description field, we need to do some additional cleaning + # for openai. 
Openai will not accept empty strings or input with more than 8191 tokens + for item in items: + item["description"] = item.get("description") or "No Description" + item["description"] = item["description"][0:8000] + try: + tbl = db.open_table(table["name"]) + except FileNotFoundError: + tbl = db.create_table(table["name"], schema=EpisodeSchema) + tbl.add(items) + + +if __name__ == "__main__": + db_path = Path(dlt.config.get("lancedb.db_path")) + db = lancedb.connect(db_path) + + for show in fields(Shows): + db.drop_table(show.name, ignore_missing=True) + + pipeline = dlt.pipeline( + pipeline_name="spotify", + destination=lancedb_destination, + dataset_name="spotify_podcast_data", + progress="log", + ) + + load_info = pipeline.run(spotify_shows()) + load_info.raise_on_failed_jobs() + print(load_info) + + row_counts = pipeline.last_trace.last_normalize_info + print(row_counts) + + query = "French AI scientist with Lex, talking about AGI and Meta and Llama" + table_to_query = "lex_fridman" + + tbl = db.open_table(table_to_query) + + results = tbl.search(query=query).to_list() + assert results diff --git a/docs/tools/fix_grammar_gpt.py b/docs/tools/fix_grammar_gpt.py index 065b53d470..9979a92b41 100644 --- a/docs/tools/fix_grammar_gpt.py +++ b/docs/tools/fix_grammar_gpt.py @@ -120,7 +120,7 @@ def get_chunk_length(chunk: List[str]) -> int: temperature=0, ) - fixed_chunks.append(response.choices[0].message.content) + fixed_chunks.append(response.choices[0].message.content) # type: ignore with open(file_path, "w", encoding="utf-8") as f: for c in fixed_chunks: diff --git a/poetry.lock b/poetry.lock index f6a6f98c1a..5a94993c80 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2416,6 +2416,20 @@ wrapt = ">=1.10,<2" [package.extras] dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] +[[package]] +name = "deprecation" +version = "2.1.0" +description = "A library to handle automated deprecations" +optional = false +python-versions = "*" +files = [ + {file = "deprecation-2.1.0-py2.py3-none-any.whl", hash = "sha256:a10811591210e1fb0e768a8c25517cabeabcba6f0bf96564f8ff45189f90b14a"}, + {file = "deprecation-2.1.0.tar.gz", hash = "sha256:72b3bde64e5d778694b0cf68178aed03d15e15477116add3fb773e581f9518ff"}, +] + +[package.dependencies] +packaging = "*" + [[package]] name = "diff-cover" version = "7.7.0" @@ -2450,6 +2464,17 @@ files = [ [package.extras] graph = ["objgraph (>=1.7.2)"] +[[package]] +name = "distro" +version = "1.9.0" +description = "Distro - an OS platform information API" +optional = false +python-versions = ">=3.6" +files = [ + {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, + {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, +] + [[package]] name = "dnspython" version = "2.4.2" @@ -4229,6 +4254,42 @@ completion = ["shtab"] docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-mypy (>=0.9.1)", "pytest-ruff"] +[[package]] +name = "lancedb" +version = "0.6.13" +description = "lancedb" +optional = false +python-versions = ">=3.8" +files = [ + {file = "lancedb-0.6.13-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:4667353ca7fa187e94cb0ca4c5f9577d65eb5160f6f3fe9e57902d86312c3869"}, + {file = "lancedb-0.6.13-cp38-abi3-macosx_11_0_arm64.whl", 
hash = "sha256:2e22533fe6f6b2d7037dcdbbb4019a62402bbad4ce18395be68f4aa007bf8bc0"}, + {file = "lancedb-0.6.13-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:837eaceafb87e3ae4c261eef45c4f73715f892a36165572c3da621dbdb45afcf"}, + {file = "lancedb-0.6.13-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:61af2d72b2a2f0ea419874c3f32760fe5e51530da3be2d65251a0e6ded74419b"}, + {file = "lancedb-0.6.13-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:31b24e57ee313f4ce6255e45d42e8bee19b90ddcd13a9e07030ac04f76e7dfde"}, + {file = "lancedb-0.6.13-cp38-abi3-win_amd64.whl", hash = "sha256:b851182d8492b1e5b57a441af64c95da65ca30b045d6618dc7d203c6d60d70fa"}, +] + +[package.dependencies] +attrs = ">=21.3.0" +cachetools = "*" +deprecation = "*" +overrides = ">=0.7" +pydantic = ">=1.10" +pylance = "0.10.12" +ratelimiter = ">=1.0,<2.0" +requests = ">=2.31.0" +retry = ">=0.9.2" +semver = "*" +tqdm = ">=4.27.0" + +[package.extras] +azure = ["adlfs (>=2024.2.0)"] +clip = ["open-clip", "pillow", "torch"] +dev = ["pre-commit", "ruff"] +docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"] +embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "instructorembedding", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"] +tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars (>=0.19)", "pytest", "pytest-asyncio", "pytest-mock", "pytz", "tantivy"] + [[package]] name = "lazy-object-proxy" version = "1.9.0" @@ -5446,6 +5507,29 @@ packaging = "*" protobuf = "*" sympy = "*" +[[package]] +name = "openai" +version = "1.35.3" +description = "The official Python library for the openai API" +optional = false +python-versions = ">=3.7.1" +files = [ + {file = "openai-1.35.3-py3-none-any.whl", hash = "sha256:7b26544cef80f125431c073ffab3811d2421fbb9e30d3bd5c2436aba00b042d5"}, + {file = "openai-1.35.3.tar.gz", hash = "sha256:d6177087f150b381d49499be782d764213fdf638d391b29ca692b84dd675a389"}, +] + +[package.dependencies] +anyio = ">=3.5.0,<5" +distro = ">=1.7.0,<2" +httpx = ">=0.23.0,<1" +pydantic = ">=1.9.0,<3" +sniffio = "*" +tqdm = ">4" +typing-extensions = ">=4.7,<5" + +[package.extras] +datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] + [[package]] name = "openpyxl" version = "3.1.2" @@ -5659,6 +5743,17 @@ files = [ {file = "orjson-3.9.5.tar.gz", hash = "sha256:6daf5ee0b3cf530b9978cdbf71024f1c16ed4a67d05f6ec435c6e7fe7a52724c"}, ] +[[package]] +name = "overrides" +version = "7.7.0" +description = "A decorator to automatically detect mismatch when overriding a method." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "overrides-7.7.0-py3-none-any.whl", hash = "sha256:c7ed9d062f78b8e4c1a7b70bd8796b35ead4d9f510227ef9c5dc7626c60d7e49"}, + {file = "overrides-7.7.0.tar.gz", hash = "sha256:55158fa3d93b98cc75299b1e67078ad9003ca27945c76162c1c0766d6f91820a"}, +] + [[package]] name = "packaging" version = "23.1" @@ -6559,6 +6654,32 @@ dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pyte docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] +[[package]] +name = "pylance" +version = "0.10.12" +description = "python wrapper for Lance columnar format" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pylance-0.10.12-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:30cbcca078edeb37e11ae86cf9287d81ce6c0c07ba77239284b369a4b361497b"}, + {file = "pylance-0.10.12-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:e558163ff6035d518706cc66848497219ccc755e2972b8f3b1706a3e1fd800fd"}, + {file = "pylance-0.10.12-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75afb39f71d7f12429f9b4d380eb6cf6aed179ae5a1c5d16cc768373a1521f87"}, + {file = "pylance-0.10.12-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:3de391dfc3a99bdb245fd1e27ef242be769a94853f802ef57f246e9a21358d32"}, + {file = "pylance-0.10.12-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:34a5278b90f4cbcf21261353976127aa2ffbbd7d068810f0a2b0c1aa0334022a"}, + {file = "pylance-0.10.12-cp38-abi3-win_amd64.whl", hash = "sha256:6cef5975d513097fd2c22692296c9a5a138928f38d02cd34ab63a7369abc1463"}, +] + +[package.dependencies] +numpy = ">=1.22" +pyarrow = ">=12,<15.0.1" + +[package.extras] +benchmarks = ["pytest-benchmark"] +dev = ["ruff (==0.2.2)"] +ray = ["ray[data]"] +tests = ["boto3", "datasets", "duckdb", "h5py (<3.11)", "ml-dtypes", "pandas", "pillow", "polars[pandas,pyarrow]", "pytest", "tensorflow", "tqdm"] +torch = ["torch"] + [[package]] name = "pymongo" version = "4.6.0" @@ -7100,6 +7221,20 @@ urllib3 = ">=1.26.14,<2.0.0" [package.extras] fastembed = ["fastembed (==0.1.1)"] +[[package]] +name = "ratelimiter" +version = "1.2.0.post0" +description = "Simple python rate limiting object" +optional = false +python-versions = "*" +files = [ + {file = "ratelimiter-1.2.0.post0-py3-none-any.whl", hash = "sha256:a52be07bc0bb0b3674b4b304550f10c769bbb00fead3072e035904474259809f"}, + {file = "ratelimiter-1.2.0.post0.tar.gz", hash = "sha256:5c395dcabdbbde2e5178ef3f89b568a3066454a6ddc223b76473dac22f89b4f7"}, +] + +[package.extras] +test = ["pytest (>=3.0)", "pytest-asyncio"] + [[package]] name = "redshift-connector" version = "2.0.915" @@ -7327,6 +7462,21 @@ files = [ [package.dependencies] types-setuptools = ">=57.0.0" +[[package]] +name = "retry" +version = "0.9.2" +description = "Easy to use retry decorator." 
+optional = false +python-versions = "*" +files = [ + {file = "retry-0.9.2-py2.py3-none-any.whl", hash = "sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606"}, + {file = "retry-0.9.2.tar.gz", hash = "sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4"}, +] + +[package.dependencies] +decorator = ">=3.4.2" +py = ">=1.4.26,<2.0.0" + [[package]] name = "rfc3339-validator" version = "0.1.4" @@ -9088,4 +9238,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "47136cc3a6247e709dfe04a810df7309d1a2bc7fe838592dd5f58dc39c2407c8" +content-hash = "4ca5f4a7955437d6da09be909a729172b9a663cc0649227e6088dc1c2cd27e57" diff --git a/pyproject.toml b/pyproject.toml index 10e3bf47d5..b99c9e4051 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -214,6 +214,8 @@ pandas = ">2" alive-progress = ">=3.0.1" pyarrow = ">=14.0.0" psycopg2-binary = ">=2.9" +lancedb = ">=0.6.13" +openai = ">=1.35" [tool.black] # https://black.readthedocs.io/en/stable/usage_and_configuration/the_basics.html#configuration-via-a-file line-length = 100 From a85ea01487aacf65d78530e7a7dc0f5b484fae9e Mon Sep 17 00:00:00 2001 From: Ilya Gurov Date: Wed, 26 Jun 2024 17:21:08 +0400 Subject: [PATCH 34/61] fix(incremental): don't filter Arrow tables with empty filters (#1480) * fix(incremental): don't filter Arrow tables with empty filters * add test case * del excess import * lint fix * lint fix * tests various pk data types --------- Co-authored-by: Marcin Rudolf --- dlt/extract/incremental/transform.py | 14 +++---- .../dlt-ecosystem/verified-sources/mongodb.md | 26 +++++++++---- tests/extract/test_incremental.py | 38 +++++++++++++++++++ 3 files changed, 64 insertions(+), 14 deletions(-) diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index 8b4cae4090..947e21f7b8 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -213,7 +213,7 @@ def compute_unique_values(self, item: "TAnyArrowItem", unique_columns: List[str] def compute_unique_values_with_index( self, item: "TAnyArrowItem", unique_columns: List[str] - ) -> List[Tuple[int, str]]: + ) -> List[Tuple[Any, str]]: if not unique_columns: return [] indices = item[self._dlt_index].to_pylist() @@ -318,12 +318,12 @@ def __call__( for i, uq_val in unique_values_index if uq_val in self.start_unique_hashes ] - # find rows with unique ids that were stored from previous run - remove_idx = pa.array(i for i, _ in unique_values_index) - # Filter the table - tbl = tbl.filter( - pa.compute.invert(pa.compute.is_in(tbl[self._dlt_index], remove_idx)) - ) + if len(unique_values_index) > 0: + # find rows with unique ids that were stored from previous run + remove_idx = pa.array(i for i, _ in unique_values_index) + tbl = tbl.filter( + pa.compute.invert(pa.compute.is_in(tbl[self._dlt_index], remove_idx)) + ) if ( self.last_value is None diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md index 6fda0f8fe9..f6d57a5ba2 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/mongodb.md @@ -317,16 +317,28 @@ verified source. 1. 
To load a selected collection and rename it in the destination: ```py - # Create the MongoDB source and select the "collection_1" collection - source = mongodb().with_resources("collection_1") + # Create the MongoDB source and select the "collection_1" collection + source = mongodb().with_resources("collection_1") - # Apply the hint to rename the table in the destination - source.resources["collection_1"].apply_hints(table_name="loaded_data_1") + # Apply the hint to rename the table in the destination + source.resources["collection_1"].apply_hints(table_name="loaded_data_1") - # Run the pipeline - info = pipeline.run(source, write_disposition="replace") - print(info) + # Run the pipeline + info = pipeline.run(source, write_disposition="replace") + print(info) ``` +1. To load a selected collection, using Apache Arrow for data conversion: + ```py + # Load collection "movies", using Apache Arrow for converion + movies = mongodb_collection( + collection="movies", + data_item_format="arrow", + ) + + # Run the pipeline + info = pipeline.run(source) + print(info) + ``` diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index bb6fb70983..49437d7b74 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -17,6 +17,7 @@ from dlt.common.configuration.specs.base_configuration import configspec, BaseConfiguration from dlt.common.configuration import ConfigurationValueError from dlt.common.pendulum import pendulum, timedelta +from dlt.common import Decimal from dlt.common.pipeline import NormalizeInfo, StateInjectableContext, resource_state from dlt.common.schema.schema import Schema from dlt.common.utils import uniq_id, digest128, chunks @@ -786,6 +787,43 @@ def some_data(first: bool, last_timestamp=dlt.sources.incremental("ts")): p.run(some_data(False)) +@pytest.mark.parametrize("item_type", set(ALL_TEST_DATA_ITEM_FORMATS) - {"pandas"}) +@pytest.mark.parametrize( + "id_value", + ("1231231231231271872", b"1231231231231271872", pendulum.now(), 1271.78, Decimal("1231.87")), +) +def test_primary_key_types(item_type: TestDataItemFormat, id_value: Any) -> None: + """Case when deduplication filter is empty for an Arrow table.""" + p = dlt.pipeline(pipeline_name=uniq_id(), destination="duckdb") + now = pendulum.now() + + data = [ + { + "delta": str(i), + "ts": now.add(days=i), + "_id": id_value, + } + for i in range(-10, 10) + ] + source_items = data_to_item_format(item_type, data) + start = now.add(days=-10) + + @dlt.resource + def some_data( + last_timestamp=dlt.sources.incremental("ts", initial_value=start, primary_key="_id"), + ): + yield from source_items + + info = p.run(some_data()) + info.raise_on_failed_jobs() + norm_info = p.last_trace.last_normalize_info + assert norm_info.row_counts["some_data"] == 20 + # load incrementally + info = p.run(some_data()) + norm_info = p.last_trace.last_normalize_info + assert "some_data" not in norm_info.row_counts + + @pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) def test_replace_resets_state(item_type: TestDataItemFormat) -> None: p = dlt.pipeline(pipeline_name=uniq_id(), destination="duckdb") From 980812e6be231af4a5365b0e72dab4a8d007c86e Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Wed, 26 Jun 2024 17:24:15 +0400 Subject: [PATCH 35/61] pass credentials as keyword argument (#1499) --- dlt/pipeline/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/pipeline/pipeline.py 
b/dlt/pipeline/pipeline.py
index 8dfb93b8da..392b195ff2 100644
--- a/dlt/pipeline/pipeline.py
+++ b/dlt/pipeline/pipeline.py
@@ -1021,7 +1021,7 @@ def _get_schema_or_create(self, schema_name: str = None) -> Schema:
         return Schema(self.pipeline_name)
 
     def _sql_job_client(self, schema: Schema, credentials: Any = None) -> SqlJobClientBase:
-        client_config = self._get_destination_client_initial_config(credentials)
+        client_config = self._get_destination_client_initial_config(credentials=credentials)
         client = self._get_destination_clients(schema, client_config)[0]
         if isinstance(client, SqlJobClientBase):
             return client

From 6224ff09b5523147150ceac9b8f05c710c6f7049 Mon Sep 17 00:00:00 2001
From: Anton Burnashev 
Date: Wed, 26 Jun 2024 15:46:19 +0200
Subject: [PATCH 36/61] Clear query parameters from previous request in BaseNextUrlPaginator to avoid duplicated params in the URL (#1515)

---
 dlt/sources/helpers/rest_client/paginators.py |  4 +++
 .../helpers/rest_client/test_paginators.py    | 25 +++++++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py
index 22cdc9b415..b6702797e9 100644
--- a/dlt/sources/helpers/rest_client/paginators.py
+++ b/dlt/sources/helpers/rest_client/paginators.py
@@ -420,6 +420,10 @@ def update_request(self, request: Request) -> None:
 
         request.url = self._next_reference
 
+        # Clear the query parameters from the previous request otherwise they
+        # will be appended to the next URL in Session.prepare_request
+        request.params = None
+
 
 class HeaderLinkPaginator(BaseNextUrlPaginator):
     """A paginator that uses the 'Link' header in HTTP responses
diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py
index 9ca54e814c..e5d31c52d2 100644
--- a/tests/sources/helpers/rest_client/test_paginators.py
+++ b/tests/sources/helpers/rest_client/test_paginators.py
@@ -3,6 +3,7 @@
 import pytest
 
 from requests.models import Response, Request
+from requests import Session
 
 from dlt.sources.helpers.rest_client.paginators import (
     SinglePagePaginator,
@@ -157,6 +158,30 @@ def test_update_request(self, test_case):
         paginator.update_request(request)
         assert request.url == test_case["expected"]
 
+    def test_no_duplicate_params_on_update_request(self):
+        paginator = JSONResponsePaginator()
+
+        request = Request(
+            method="GET",
+            url="http://example.com/api/resource",
+            params={"param1": "value1"},
+        )
+
+        session = Session()
+
+        response = Mock(Response, json=lambda: {"next": "/api/resource?page=2&param1=value1"})
+        paginator.update_state(response)
+        paginator.update_request(request)
+
+        assert request.url == "http://example.com/api/resource?page=2&param1=value1"
+
+        # RESTClient._send_request() calls Session.prepare_request() which
+        # updates the URL with the query parameters from the request object.
+        prepared_request = session.prepare_request(request)
+
+        # The next request should just use the "next" URL without any duplicate parameters.
+ assert prepared_request.url == "http://example.com/api/resource?page=2¶m1=value1" + class TestSinglePagePaginator: def test_update_state(self): From 3d009dc95540195aa4fab9f7921e3a47bd7b7065 Mon Sep 17 00:00:00 2001 From: Alena Astrakhantseva Date: Wed, 26 Jun 2024 16:10:21 +0200 Subject: [PATCH 37/61] Docs: remove metrics resource from stripe (#1513) * remove metrics resource * fix link * Update docs/website/docs/dlt-ecosystem/verified-sources/stripe.md * Update docs/website/docs/dlt-ecosystem/verified-sources/stripe.md --------- Co-authored-by: Anton Burnashev --- .../dlt-ecosystem/verified-sources/stripe.md | 38 ++----------------- 1 file changed, 3 insertions(+), 35 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md index 5844844cca..8c39a5090e 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/stripe.md @@ -175,24 +175,7 @@ def incremental_stripe_source( After each run, 'initial_start_date' updates to the last loaded date. Subsequent runs then retrieve only new data using append mode, streamlining the process and preventing redundant data downloads. -For more information, read the [General Usage: Incremental loading](../../general-usage/incremental-loading). - -### Resource `metrics_resource` - -This function loads a dictionary with calculated metrics, including MRR and Churn rate, along with the current timestamp. - -```py -@dlt.resource(name="Metrics", write_disposition="append", primary_key="created") -def metrics_resource() -> Iterable[TDataItem]: - ... -``` - -Abrevations MRR and Churn rate are as follows: -- Monthly Recurring Revenue (MRR): - - Measures the predictable monthly revenue from all active subscriptions. It's the sum of the monthly-normalized subscription amounts. -- Churn rate: - - Indicates the rate subscribers leave a service over a specific period. Calculated by dividing the number of recent cancellations by the total subscribers from 30 days ago, adjusted for new subscribers. - +For more information, read the [Incremental loading](../../general-usage/incremental-loading). ## Customization ### Create your own pipeline @@ -236,7 +219,7 @@ verified source. ``` > For subsequent runs, the dlt module sets the previous "end_date" as "initial_start_date", ensuring incremental data retrieval. -1. To load data created after December 31, 2022, adjust the data range for stripe_source to prevent redundant loading. For incremental_stripe_source, the initial_start_date will auto-update to the last loaded date from the previous run. +1. To load data created after December 31, 2022, adjust the data range for stripe_source to prevent redundant loading. For `incremental_stripe_source`, the `initial_start_date` will auto-update to the last loaded date from the previous run. ```py source_single = stripe_source( @@ -249,21 +232,6 @@ verified source. load_info = pipeline.run(data=[source_single, source_incremental]) print(load_info) ``` - > To load data, maintain the pipeline name and destination dataset name. The pipeline name is vital for accessing the last run's [state](https://dlthub.com/docs/general-usage/state), which determines the incremental data load's end date. 
Altering these names can trigger a [“full_refresh”](https://dlthub.com/docs/general-usage/pipeline#do-experiments-with-full-refresh), disrupting the metadata (state) tracking for [incremental data loading](https://dlthub.com/docs/general-usage/incremental-loading). - -1. To load important metrics and store them in database: - - ```py - # Event is an endpoint with uneditable data, so we can use 'incremental_stripe_source'. - source_event = incremental_stripe_source(endpoints=("Event",)) - # Subscription is an endpoint with editable data, use stripe_source. - source_subs = stripe_source(endpoints=("Subscription",)) - load_info = pipeline.run(data=[source_subs, source_event]) - print(load_info) - resource = metrics_resource() - print(list(resource)) - load_info = pipeline.run(resource) - print(load_info) - ``` + > To load data, maintain the pipeline name and destination dataset name. The pipeline name is vital for accessing the last run's [state](../../general-usage/state), which determines the incremental data load's end date. Altering these names can trigger a [“full_refresh”](../../general-usage/pipeline#do-experiments-with-full-refresh), disrupting the metadata (state) tracking for [incremental data loading](../../general-usage/incremental-loading). From b76f8f4130783fbd159657b71cfbf0903fa7be4a Mon Sep 17 00:00:00 2001 From: rudolfix Date: Wed, 26 Jun 2024 23:08:09 +0200 Subject: [PATCH 38/61] allows naming conventions to be changed (#998) * allows to decorate async function with dlt.source * adds pytest-async and updates pytest to 7.x * fixes forked teardown issue 7.x * bumps deps for py 3.12 * adds py 12 common tests * fixes typings after deps bump * bumps airflow, yanks duckdb to 0.9.2 * fixes tests * fixes pandas version * adds 3.12 duckdb dep * adds right hand pipe operator * fixes docker ci build * adds docs on async sources and resources * normalizes default hints and preferred types in schema * defines pipeline state table in utils, column normalization in simple regex * normalizes all identifiers used by relational normalizer, fixes other modules * fixes sql job client to use normalized identifiers in queries * runs state sync tests for lower and upper case naming conventions * fixes weaviate to use normalized identifiers in queries * partially fixes qdrant incorrect state and version retrieval queries * initial sql uppercase naming convention * adds native df readers to databricks and bigquery * adds casing identifier capability to support different casing in naming conventions, fixes how identifiers are normalized in destinations * cleans typing for relational normalizer * renames escape functions * destination capabilities for case fold and case sensitivity * drops supports naming module and allows naming to be instance in config and schema * checks all tables in information schema in one go, observes case folding and sensitivity in sql destinations * moves schema verification to destination utils * adds method to remove processing hints from schema, helper functions for schema settings, refactor, tests * accepts naming convention instances when resolving configs * fixes the cloning of schema in decorator, removes processing hints * removes processing hints when saving imported schema * adds docs on naming conventions, removes technical docs * adds casing info to databrick caps, makes caps an instance attr * adjusts destination casing in caps from schema naming and config * raises detailed schema identifier clash exceptions * adds is_case_sensitive and name to NamingConvention 
* adds sanity check if _dlt prefix is preserved * finds genric types in non generic classes deriving from generic * uses casefold INSERT VALUES job column names * adds a method make_qualified_table_name_path that calculates components of fully qualified table name and uses it to query INFO SCHEMA * adds casing info to destinations, caps as instance attrs, custom table name paths * adds naming convention to restore state tests, make them essential * fixes table builder tests * removes processing hints when exporting schema to import folder, warns on schema import overriding local schema, warns on processing hints present * allows to subclass INFO SCHEMA query generation and uses specialized big query override * uses correct schema escaping function in sql jobs * passes pipeline state to package state via extract * fixes optional normalizers module * excludes version_hash from pipeline state SELECT * passes pipeline state to package state pt.2 * re-enables sentry tests * bumps qdrant client, makes test running for local version * makes weaviate running * uses schemata to find databases on athena * uses api get_table for hidden dataset on bigquery to reflect schemas, support case insensitive datasets * adds naming conventions to two restore state tests * fixes escape identifiers to column escape * fix conflicts in docs * adjusts capabilities in capabilities() method, uses config and naming optionally * allows to add props to classes without vectorizer in weaviate * moves caps function into factories, cleansup adapters and custom destination * sentry_dsn * adds basic destination reference tests * fixes table builder tests * fix deps and docs * fixes more tests * case sensitivity docs stubs * fixes drop_pipeline fixture * improves partial config generation for capabilities * adds snowflake csv support * creates separate csv tests * allows to import files into extract storage, adds import file writer and spec * handles ImportFileMeta in extractor * adds import file item normalizer and router to normalize * supports csv format config for snowflake * removes realpath wherever possible and adds fast make_full_path to FileStorage * adds additional methods to load_package storage to make listings faster * adds file_format to dlt.resource, uses preferred file format for dlt state table * docs for importing files, file_format * code improvements and tests * docs hard links note * moves loader parallelism test to pipeliens, solves duckdb ci test error issue * fixes tests * moves drop_pipeline fixture level up * drops default naming convention from caps so naming in saved schema persists, allows (section, , schema) config section for schema settings * unifies all representations of pipeline state * tries to decompress text file first in fs_client * tests get stored state in test_job_client * removes credentials from dlt.attach, addes destination and staging factories * cleans up env variables and pipeline dropping fixutere precedence * removes dev_mode from dlt.attach * adds missing arguments to filesystem factory * fixes tests * updates destination and naming convention docs * removes is_case_sensitive from naming convention initializer * simplifies with_file_import mark * adds case sensitivity tests * uses dev_mode everywhere * improves csv docs * fixes encodings in fsspec * improves naming convention docs * fixes tests and renames clash to collision * fixes getting original bases from instance --- dlt/common/data_writers/__init__.py | 6 +- dlt/common/data_writers/buffered.py | 33 +- 
dlt/common/data_writers/configuration.py | 31 ++ dlt/common/data_writers/escape.py | 6 +- dlt/common/data_writers/exceptions.py | 10 + dlt/common/data_writers/writers.py | 80 +-- dlt/common/destination/capabilities.py | 30 +- dlt/common/destination/exceptions.py | 13 + dlt/common/destination/reference.py | 277 +++++---- dlt/common/destination/utils.py | 115 ++++ dlt/common/libs/pyarrow.py | 8 +- dlt/common/normalizers/__init__.py | 8 +- dlt/common/normalizers/configuration.py | 7 +- dlt/common/normalizers/json/relational.py | 202 ++++--- dlt/common/normalizers/naming/__init__.py | 4 +- dlt/common/normalizers/naming/direct.py | 1 - dlt/common/normalizers/naming/duck_case.py | 8 +- dlt/common/normalizers/naming/exceptions.py | 11 +- dlt/common/normalizers/naming/naming.py | 21 +- dlt/common/normalizers/naming/snake_case.py | 11 +- dlt/common/normalizers/naming/sql_ci_v1.py | 12 + dlt/common/normalizers/naming/sql_cs_v1.py | 22 + dlt/common/normalizers/typing.py | 8 +- dlt/common/normalizers/utils.py | 118 ++-- dlt/common/pipeline.py | 3 - dlt/common/schema/exceptions.py | 129 ++++- dlt/common/schema/migrations.py | 8 +- dlt/common/schema/schema.py | 457 ++++++++++----- dlt/common/schema/typing.py | 31 +- dlt/common/schema/utils.py | 161 +++++- dlt/common/storages/data_item_storage.py | 5 +- dlt/common/storages/exceptions.py | 17 + dlt/common/storages/file_storage.py | 18 +- dlt/common/storages/fsspec_filesystem.py | 1 - dlt/common/storages/live_schema_storage.py | 14 - dlt/common/storages/load_package.py | 188 +++++-- dlt/common/storages/schema_storage.py | 103 +++- dlt/common/typing.py | 36 +- dlt/common/utils.py | 47 +- dlt/common/validation.py | 3 +- dlt/destinations/adapters.py | 13 +- dlt/destinations/fs_client.py | 16 +- dlt/destinations/impl/athena/__init__.py | 33 -- dlt/destinations/impl/athena/athena.py | 47 +- dlt/destinations/impl/athena/factory.py | 41 +- dlt/destinations/impl/bigquery/__init__.py | 31 -- dlt/destinations/impl/bigquery/bigquery.py | 118 ++-- .../impl/bigquery/configuration.py | 1 + dlt/destinations/impl/bigquery/factory.py | 61 +- dlt/destinations/impl/bigquery/sql_client.py | 57 +- dlt/destinations/impl/clickhouse/__init__.py | 53 -- .../impl/clickhouse/clickhouse.py | 38 +- dlt/destinations/impl/clickhouse/factory.py | 56 +- .../impl/clickhouse/sql_client.py | 60 +- dlt/destinations/impl/databricks/__init__.py | 30 - .../impl/databricks/databricks.py | 29 +- dlt/destinations/impl/databricks/factory.py | 32 +- .../impl/databricks/sql_client.py | 46 +- dlt/destinations/impl/destination/__init__.py | 21 - .../impl/destination/configuration.py | 21 +- .../impl/destination/destination.py | 14 +- dlt/destinations/impl/destination/factory.py | 49 +- dlt/destinations/impl/dremio/__init__.py | 3 + dlt/destinations/impl/dremio/dremio.py | 54 +- dlt/destinations/impl/dremio/factory.py | 33 +- dlt/destinations/impl/dremio/sql_client.py | 36 +- dlt/destinations/impl/duckdb/__init__.py | 26 - dlt/destinations/impl/duckdb/duck.py | 16 +- dlt/destinations/impl/duckdb/factory.py | 29 +- dlt/destinations/impl/duckdb/sql_client.py | 18 +- dlt/destinations/impl/dummy/__init__.py | 39 -- dlt/destinations/impl/dummy/dummy.py | 14 +- dlt/destinations/impl/dummy/factory.py | 34 +- dlt/destinations/impl/filesystem/__init__.py | 24 - dlt/destinations/impl/filesystem/factory.py | 41 +- .../impl/filesystem/filesystem.py | 40 +- dlt/destinations/impl/filesystem/typing.py | 4 +- dlt/destinations/impl/motherduck/__init__.py | 24 - dlt/destinations/impl/motherduck/factory.py | 27 +- 
.../impl/motherduck/motherduck.py | 18 +- .../impl/motherduck/sql_client.py | 49 +- dlt/destinations/impl/mssql/__init__.py | 29 - dlt/destinations/impl/mssql/configuration.py | 1 + dlt/destinations/impl/mssql/factory.py | 61 +- dlt/destinations/impl/mssql/mssql.py | 23 +- dlt/destinations/impl/mssql/sql_client.py | 21 +- dlt/destinations/impl/postgres/__init__.py | 27 - .../impl/postgres/configuration.py | 6 +- dlt/destinations/impl/postgres/factory.py | 36 +- dlt/destinations/impl/postgres/postgres.py | 106 +++- dlt/destinations/impl/postgres/sql_client.py | 16 +- dlt/destinations/impl/qdrant/__init__.py | 18 - dlt/destinations/impl/qdrant/configuration.py | 4 +- dlt/destinations/impl/qdrant/factory.py | 17 +- dlt/destinations/impl/qdrant/qdrant_client.py | 159 ++++-- dlt/destinations/impl/redshift/__init__.py | 25 - .../impl/redshift/configuration.py | 2 + dlt/destinations/impl/redshift/factory.py | 50 +- dlt/destinations/impl/redshift/redshift.py | 38 +- dlt/destinations/impl/snowflake/__init__.py | 25 - .../impl/snowflake/configuration.py | 6 +- dlt/destinations/impl/snowflake/factory.py | 36 +- dlt/destinations/impl/snowflake/snowflake.py | 70 ++- dlt/destinations/impl/snowflake/sql_client.py | 17 +- dlt/destinations/impl/synapse/__init__.py | 54 -- dlt/destinations/impl/synapse/factory.py | 73 ++- dlt/destinations/impl/synapse/sql_client.py | 3 - dlt/destinations/impl/synapse/synapse.py | 20 +- dlt/destinations/impl/weaviate/__init__.py | 19 - dlt/destinations/impl/weaviate/ci_naming.py | 5 + dlt/destinations/impl/weaviate/exceptions.py | 6 +- dlt/destinations/impl/weaviate/factory.py | 23 +- dlt/destinations/impl/weaviate/naming.py | 4 + .../impl/weaviate/weaviate_client.py | 165 ++++-- dlt/destinations/insert_job_client.py | 12 +- dlt/destinations/job_client_impl.py | 290 ++++++---- dlt/destinations/sql_client.py | 61 +- dlt/destinations/sql_jobs.py | 34 +- dlt/destinations/utils.py | 127 +++++ dlt/extract/__init__.py | 3 +- dlt/extract/decorators.py | 93 +++- dlt/extract/extract.py | 16 +- dlt/extract/extractors.py | 100 +++- dlt/extract/hints.py | 12 + dlt/extract/items.py | 4 + dlt/extract/resource.py | 7 +- dlt/extract/source.py | 32 +- dlt/load/configuration.py | 5 +- dlt/load/load.py | 50 +- dlt/load/utils.py | 10 +- dlt/normalize/items_normalizers.py | 38 +- dlt/normalize/normalize.py | 243 +------- dlt/normalize/worker.py | 254 +++++++++ dlt/pipeline/__init__.py | 24 +- dlt/pipeline/dbt.py | 2 +- dlt/pipeline/mark.py | 1 + dlt/pipeline/pipeline.py | 99 ++-- dlt/pipeline/state_sync.py | 50 +- .../custom_destination_bigquery.py | 2 +- .../custom_destination_lancedb.py | 4 +- .../pdf_to_weaviate/pdf_to_weaviate.py | 2 +- .../postgres_to_postgres.py | 6 +- docs/technical/README.md | 10 - docs/technical/create_pipeline.md | 441 --------------- docs/technical/general_usage.md | 40 +- docs/technical/working_with_schemas.md | 178 ------ docs/website/blog/2023-09-05-mongo-etl.md | 2 +- docs/website/blog/2023-10-23-arrow-loading.md | 4 +- .../blog/2023-12-01-dlt-kestra-demo.md | 8 +- .../docs/dlt-ecosystem/destinations/athena.md | 4 +- .../dlt-ecosystem/destinations/bigquery.md | 2 +- .../docs/dlt-ecosystem/destinations/dremio.md | 2 +- .../docs/dlt-ecosystem/destinations/duckdb.md | 2 +- .../dlt-ecosystem/destinations/postgres.md | 22 + .../dlt-ecosystem/destinations/redshift.md | 6 + .../dlt-ecosystem/destinations/snowflake.md | 43 +- .../dlt-ecosystem/destinations/synapse.md | 2 + .../dlt-ecosystem/destinations/weaviate.md | 2 +- .../docs/dlt-ecosystem/file-formats/csv.md | 25 +- 
.../verified-sources/google_sheets.md | 2 +- .../verified-sources/sql_database.md | 2 +- .../website/docs/general-usage/destination.md | 88 ++- .../docs/general-usage/naming-convention.md | 113 ++++ docs/website/docs/general-usage/resource.md | 79 ++- docs/website/docs/general-usage/schema.md | 76 ++- .../performance_snippets/toml-snippets.toml | 2 +- .../walkthroughs/create-new-destination.md | 4 + docs/website/sidebars.js | 1 + poetry.lock | 527 +++++++++++++++--- pyproject.toml | 6 +- tests/common/cases/destinations/null.py | 2 +- tests/common/cases/normalizers/__init__.py | 0 tests/common/cases/normalizers/sql_upper.py | 20 + tests/common/cases/normalizers/title_case.py | 14 + tests/common/configuration/test_inject.py | 14 +- .../common/data_writers/test_data_writers.py | 23 +- tests/common/normalizers/snake_no_x.py | 10 + .../normalizers/test_import_normalizers.py | 19 +- .../normalizers/test_json_relational.py | 83 +-- tests/common/normalizers/test_naming.py | 5 +- tests/common/schema/conftest.py | 25 + tests/common/schema/test_filtering.py | 5 - tests/common/schema/test_inference.py | 26 +- tests/common/schema/test_merges.py | 29 +- .../schema/test_normalize_identifiers.py | 412 ++++++++++++++ tests/common/schema/test_schema.py | 256 +++++---- tests/common/schema/test_versioning.py | 1 - tests/common/storages/test_file_storage.py | 36 +- tests/common/storages/test_load_package.py | 178 +++++- tests/common/storages/test_load_storage.py | 12 +- tests/common/storages/test_schema_storage.py | 56 +- tests/common/storages/utils.py | 12 + tests/common/test_destination.py | 144 ++++- tests/common/utils.py | 16 +- tests/destinations/test_custom_destination.py | 89 +-- .../data_writers/test_buffered_writer.py | 21 + tests/extract/test_decorators.py | 69 ++- tests/extract/test_extract.py | 2 + tests/extract/test_extract_pipe.py | 13 + tests/extract/test_sources.py | 6 + tests/libs/pyarrow/test_pyarrow_normalizer.py | 6 +- .../athena_iceberg/test_athena_adapter.py | 4 +- .../athena_iceberg/test_athena_iceberg.py | 8 +- tests/load/bigquery/test_bigquery_client.py | 2 +- .../test_bigquery_streaming_insert.py | 10 +- .../bigquery/test_bigquery_table_builder.py | 28 +- tests/load/cases/loading/csv_header.csv | 3 + tests/load/cases/loading/csv_no_header.csv | 2 + tests/load/cases/loading/csv_no_header.csv.gz | Bin 0 -> 90 bytes tests/load/cases/loading/header.jsonl | 2 + .../clickhouse/test_clickhouse_adapter.py | 2 +- .../test_clickhouse_gcs_s3_compatibility.py | 2 +- .../test_clickhouse_table_builder.py | 4 +- tests/load/conftest.py | 4 +- .../test_databricks_configuration.py | 1 - tests/load/dremio/test_dremio_client.py | 10 +- tests/load/duckdb/test_duckdb_client.py | 5 +- .../load/duckdb/test_duckdb_table_builder.py | 5 +- tests/load/duckdb/test_motherduck_client.py | 2 +- tests/load/filesystem/test_aws_credentials.py | 8 +- .../load/filesystem/test_azure_credentials.py | 2 +- .../load/filesystem/test_filesystem_client.py | 35 ++ .../load/filesystem/test_filesystem_common.py | 9 +- .../test_object_store_rs_credentials.py | 10 +- ...entials.py => test_mssql_configuration.py} | 33 +- tests/load/mssql/test_mssql_table_builder.py | 11 +- tests/load/pipeline/conftest.py | 8 +- tests/load/pipeline/test_arrow_loading.py | 4 +- tests/load/pipeline/test_athena.py | 10 +- tests/load/pipeline/test_bigquery.py | 3 +- tests/load/pipeline/test_clickhouse.py | 7 +- tests/load/pipeline/test_csv_loading.py | 172 ++++++ tests/load/pipeline/test_dbt_helper.py | 2 +- tests/load/pipeline/test_dremio.py | 4 +- 
tests/load/pipeline/test_drop.py | 24 +- tests/load/pipeline/test_duckdb.py | 13 +- .../load/pipeline/test_filesystem_pipeline.py | 17 +- tests/load/pipeline/test_merge_disposition.py | 19 +- tests/load/{ => pipeline}/test_parallelism.py | 2 +- tests/load/pipeline/test_pipelines.py | 62 ++- tests/load/pipeline/test_postgres.py | 68 +-- tests/load/pipeline/test_redshift.py | 2 +- tests/load/pipeline/test_refresh_modes.py | 2 +- .../load/pipeline/test_replace_disposition.py | 4 +- tests/load/pipeline/test_restore_state.py | 112 +++- tests/load/pipeline/test_scd2.py | 19 +- .../load/pipeline/test_snowflake_pipeline.py | 55 ++ tests/load/pipeline/test_stage_loading.py | 15 +- .../test_write_disposition_changes.py | 10 +- tests/load/pipeline/utils.py | 66 --- tests/load/postgres/test_postgres_client.py | 2 +- .../postgres/test_postgres_table_builder.py | 46 +- tests/load/qdrant/test_pipeline.py | 3 + tests/load/qdrant/utils.py | 25 +- tests/load/redshift/test_redshift_client.py | 35 +- .../redshift/test_redshift_table_builder.py | 3 +- .../snowflake/test_snowflake_configuration.py | 8 +- .../snowflake/test_snowflake_table_builder.py | 4 +- .../synapse/test_synapse_configuration.py | 40 +- .../synapse/test_synapse_table_builder.py | 15 +- .../synapse/test_synapse_table_indexing.py | 8 +- tests/load/test_dummy_client.py | 62 +-- tests/load/test_insert_job_client.py | 63 ++- tests/load/test_job_client.py | 140 +++-- tests/load/test_sql_client.py | 18 +- tests/load/utils.py | 110 +++- tests/load/weaviate/test_pipeline.py | 27 +- tests/load/weaviate/test_weaviate_client.py | 42 +- tests/load/weaviate/utils.py | 105 ++-- tests/normalize/test_max_nesting.py | 4 +- tests/normalize/test_normalize.py | 34 +- tests/normalize/utils.py | 15 +- .../cases/github_pipeline/github_pipeline.py | 16 +- tests/pipeline/test_arrow_sources.py | 52 +- tests/pipeline/test_dlt_versions.py | 147 +++-- tests/pipeline/test_import_export_schema.py | 4 +- tests/pipeline/test_pipeline.py | 215 ++++++- tests/pipeline/test_pipeline_extra.py | 71 ++- tests/pipeline/test_pipeline_state.py | 59 +- tests/pipeline/utils.py | 4 +- .../helpers/rest_client/test_client.py | 1 - tests/utils.py | 2 +- 282 files changed, 7924 insertions(+), 4060 deletions(-) create mode 100644 dlt/common/data_writers/configuration.py create mode 100644 dlt/common/destination/utils.py create mode 100644 dlt/common/normalizers/naming/sql_ci_v1.py create mode 100644 dlt/common/normalizers/naming/sql_cs_v1.py create mode 100644 dlt/normalize/worker.py delete mode 100644 docs/technical/README.md delete mode 100644 docs/technical/create_pipeline.md create mode 100644 docs/website/docs/general-usage/naming-convention.md create mode 100644 tests/common/cases/normalizers/__init__.py create mode 100644 tests/common/cases/normalizers/sql_upper.py create mode 100644 tests/common/cases/normalizers/title_case.py create mode 100644 tests/common/normalizers/snake_no_x.py create mode 100644 tests/common/schema/conftest.py create mode 100644 tests/common/schema/test_normalize_identifiers.py create mode 100644 tests/load/cases/loading/csv_header.csv create mode 100644 tests/load/cases/loading/csv_no_header.csv create mode 100644 tests/load/cases/loading/csv_no_header.csv.gz create mode 100644 tests/load/cases/loading/header.jsonl rename tests/load/mssql/{test_mssql_credentials.py => test_mssql_configuration.py} (77%) create mode 100644 tests/load/pipeline/test_csv_loading.py rename tests/load/{ => pipeline}/test_parallelism.py (98%) create mode 100644 
tests/load/pipeline/test_snowflake_pipeline.py diff --git a/dlt/common/data_writers/__init__.py b/dlt/common/data_writers/__init__.py index 97451d8be7..945e74a37b 100644 --- a/dlt/common/data_writers/__init__.py +++ b/dlt/common/data_writers/__init__.py @@ -3,6 +3,7 @@ DataWriterMetrics, TDataItemFormat, FileWriterSpec, + create_import_spec, resolve_best_writer_spec, get_best_writer_spec, is_native_writer, @@ -11,12 +12,13 @@ from dlt.common.data_writers.escape import ( escape_redshift_literal, escape_redshift_identifier, - escape_bigquery_identifier, + escape_hive_identifier, ) __all__ = [ "DataWriter", "FileWriterSpec", + "create_import_spec", "resolve_best_writer_spec", "get_best_writer_spec", "is_native_writer", @@ -26,5 +28,5 @@ "new_file_id", "escape_redshift_literal", "escape_redshift_identifier", - "escape_bigquery_identifier", + "escape_hive_identifier", ] diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index bd32c68c49..8077007edb 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -1,11 +1,13 @@ import gzip import time -from typing import ClassVar, List, IO, Any, Optional, Type, Generic +import contextlib +from typing import ClassVar, Iterator, List, IO, Any, Optional, Type, Generic from dlt.common.typing import TDataItem, TDataItems from dlt.common.data_writers.exceptions import ( BufferedDataWriterClosed, DestinationCapabilitiesRequired, + FileImportNotFound, InvalidFileNameTemplateException, ) from dlt.common.data_writers.writers import TWriter, DataWriter, DataWriterMetrics, FileWriterSpec @@ -138,18 +140,31 @@ def write_empty_file(self, columns: TTableSchemaColumns) -> DataWriterMetrics: self._last_modified = time.time() return self._rotate_file(allow_empty_file=True) - def import_file(self, file_path: str, metrics: DataWriterMetrics) -> DataWriterMetrics: + def import_file( + self, file_path: str, metrics: DataWriterMetrics, with_extension: str = None + ) -> DataWriterMetrics: """Import a file from `file_path` into items storage under a new file name. Does not check the imported file format. Uses counts from `metrics` as a base. Logically closes the imported file The preferred import method is a hard link to avoid copying the data. If current filesystem does not support it, a regular copy is used. + + Alternative extension may be provided via `with_extension` so various file formats may be imported into the same folder. """ # TODO: we should separate file storage from other storages. this creates circular deps from dlt.common.storages import FileStorage - self._rotate_file() - FileStorage.link_hard_with_fallback(file_path, self._file_name) + # import file with alternative extension + spec = self.writer_spec + if with_extension: + spec = self.writer_spec._replace(file_extension=with_extension) + with self.alternative_spec(spec): + self._rotate_file() + try: + FileStorage.link_hard_with_fallback(file_path, self._file_name) + except FileNotFoundError as f_ex: + raise FileImportNotFound(file_path, self._file_name) from f_ex + self._last_modified = time.time() metrics = metrics._replace( file_path=self._file_name, @@ -176,6 +191,16 @@ def close(self, skip_flush: bool = False) -> None: def closed(self) -> bool: return self._closed + @contextlib.contextmanager + def alternative_spec(self, spec: FileWriterSpec) -> Iterator[FileWriterSpec]: + """Temporarily changes the writer spec ie. 
for the moment file is rotated""" + old_spec = self.writer_spec + try: + self.writer_spec = spec + yield spec + finally: + self.writer_spec = old_spec + def __enter__(self) -> "BufferedDataWriter[TWriter]": return self diff --git a/dlt/common/data_writers/configuration.py b/dlt/common/data_writers/configuration.py new file mode 100644 index 0000000000..a837cb47b0 --- /dev/null +++ b/dlt/common/data_writers/configuration.py @@ -0,0 +1,31 @@ +from typing import ClassVar, Literal, Optional +from dlt.common.configuration import configspec, known_sections +from dlt.common.configuration.specs import BaseConfiguration + +CsvQuoting = Literal["quote_all", "quote_needed"] + + +@configspec +class CsvFormatConfiguration(BaseConfiguration): + delimiter: str = "," + include_header: bool = True + quoting: CsvQuoting = "quote_needed" + + # read options + on_error_continue: bool = False + encoding: str = "utf-8" + + __section__: ClassVar[str] = known_sections.DATA_WRITER + + +@configspec +class ParquetFormatConfiguration(BaseConfiguration): + flavor: Optional[str] = None # could be ie. "spark" + version: Optional[str] = "2.4" + data_page_size: Optional[int] = None + timestamp_timezone: str = "UTC" + row_group_size: Optional[int] = None + coerce_timestamps: Optional[Literal["s", "ms", "us", "ns"]] = None + allow_truncated_timestamps: bool = False + + __section__: ClassVar[str] = known_sections.DATA_WRITER diff --git a/dlt/common/data_writers/escape.py b/dlt/common/data_writers/escape.py index 580b057716..06c8d7a95a 100644 --- a/dlt/common/data_writers/escape.py +++ b/dlt/common/data_writers/escape.py @@ -124,7 +124,7 @@ def escape_redshift_identifier(v: str) -> str: escape_dremio_identifier = escape_postgres_identifier -def escape_bigquery_identifier(v: str) -> str: +def escape_hive_identifier(v: str) -> str: # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical return "`" + v.replace("\\", "\\\\").replace("`", "\\`") + "`" @@ -132,10 +132,10 @@ def escape_bigquery_identifier(v: str) -> str: def escape_snowflake_identifier(v: str) -> str: # Snowcase uppercase all identifiers unless quoted. 
Match this here so queries on information schema work without issue # See also https://docs.snowflake.com/en/sql-reference/identifiers-syntax#double-quoted-identifiers - return escape_postgres_identifier(v.upper()) + return escape_postgres_identifier(v) -escape_databricks_identifier = escape_bigquery_identifier +escape_databricks_identifier = escape_hive_identifier DATABRICKS_ESCAPE_DICT = {"'": "\\'", "\\": "\\\\", "\n": "\\n", "\r": "\\r"} diff --git a/dlt/common/data_writers/exceptions.py b/dlt/common/data_writers/exceptions.py index 1d5c58f787..3b11ed70fc 100644 --- a/dlt/common/data_writers/exceptions.py +++ b/dlt/common/data_writers/exceptions.py @@ -22,6 +22,16 @@ def __init__(self, file_name: str): super().__init__(f"Writer with recent file name {file_name} is already closed") +class FileImportNotFound(DataWriterException, FileNotFoundError): + def __init__(self, import_file_path: str, local_file_path: str) -> None: + self.import_file_path = import_file_path + self.local_file_path = local_file_path + super().__init__( + f"Attempt to import non existing file {import_file_path} into extract storage file" + f" {local_file_path}" + ) + + class DestinationCapabilitiesRequired(DataWriterException, ValueError): def __init__(self, file_format: TLoaderFileFormat): self.file_format = file_format diff --git a/dlt/common/data_writers/writers.py b/dlt/common/data_writers/writers.py index 8936dae605..d324792a83 100644 --- a/dlt/common/data_writers/writers.py +++ b/dlt/common/data_writers/writers.py @@ -4,7 +4,6 @@ IO, TYPE_CHECKING, Any, - ClassVar, Dict, List, Literal, @@ -17,8 +16,7 @@ ) from dlt.common.json import json -from dlt.common.configuration import configspec, known_sections, with_config -from dlt.common.configuration.specs import BaseConfiguration +from dlt.common.configuration import with_config from dlt.common.data_writers.exceptions import ( SpecLookupFailed, DataWriterNotFound, @@ -26,15 +24,25 @@ FileSpecNotFound, InvalidDataItem, ) -from dlt.common.destination import DestinationCapabilitiesContext, TLoaderFileFormat +from dlt.common.data_writers.configuration import ( + CsvFormatConfiguration, + CsvQuoting, + ParquetFormatConfiguration, +) +from dlt.common.destination import ( + DestinationCapabilitiesContext, + TLoaderFileFormat, + ALL_SUPPORTED_FILE_FORMATS, +) from dlt.common.schema.typing import TTableSchemaColumns from dlt.common.typing import StrAny + if TYPE_CHECKING: from dlt.common.libs.pyarrow import pyarrow as pa -TDataItemFormat = Literal["arrow", "object"] +TDataItemFormat = Literal["arrow", "object", "file"] TWriter = TypeVar("TWriter", bound="DataWriter") @@ -124,6 +132,9 @@ def item_format_from_file_extension(cls, extension: str) -> TDataItemFormat: return "object" elif extension == "parquet": return "arrow" + # those files may be imported by normalizer as is + elif extension in ALL_SUPPORTED_FILE_FORMATS: + return "file" else: raise ValueError(f"Cannot figure out data item format for extension {extension}") @@ -132,6 +143,8 @@ def writer_class_from_spec(spec: FileWriterSpec) -> Type["DataWriter"]: try: return WRITER_SPECS[spec] except KeyError: + if spec.data_item_format == "file": + return ImportFileWriter raise FileSpecNotFound(spec.file_format, spec.data_item_format, spec) @staticmethod @@ -147,6 +160,19 @@ def class_factory( raise FileFormatForItemFormatNotFound(file_format, data_item_format) +class ImportFileWriter(DataWriter): + """May only import files, fails on any open/write operations""" + + def write_header(self, columns_schema: TTableSchemaColumns) 
-> None: + raise NotImplementedError( + "ImportFileWriter cannot write any files. You have bug in your code." + ) + + @classmethod + def writer_spec(cls) -> FileWriterSpec: + raise NotImplementedError("ImportFileWriter has no single spec") + + class JsonlWriter(DataWriter): def write_data(self, rows: Sequence[Any]) -> None: super().write_data(rows) @@ -260,21 +286,8 @@ def writer_spec(cls) -> FileWriterSpec: ) -@configspec -class ParquetDataWriterConfiguration(BaseConfiguration): - flavor: Optional[str] = None # could be ie. "spark" - version: Optional[str] = "2.4" - data_page_size: Optional[int] = None - timestamp_timezone: str = "UTC" - row_group_size: Optional[int] = None - coerce_timestamps: Optional[Literal["s", "ms", "us", "ns"]] = None - allow_truncated_timestamps: bool = False - - __section__: ClassVar[str] = known_sections.DATA_WRITER - - class ParquetDataWriter(DataWriter): - @with_config(spec=ParquetDataWriterConfiguration) + @with_config(spec=ParquetFormatConfiguration) def __init__( self, f: IO[Any], @@ -381,20 +394,8 @@ def writer_spec(cls) -> FileWriterSpec: ) -CsvQuoting = Literal["quote_all", "quote_needed"] - - -@configspec -class CsvDataWriterConfiguration(BaseConfiguration): - delimiter: str = "," - include_header: bool = True - quoting: CsvQuoting = "quote_needed" - - __section__: ClassVar[str] = known_sections.DATA_WRITER - - class CsvWriter(DataWriter): - @with_config(spec=CsvDataWriterConfiguration) + @with_config(spec=CsvFormatConfiguration) def __init__( self, f: IO[Any], @@ -525,7 +526,7 @@ def writer_spec(cls) -> FileWriterSpec: class ArrowToCsvWriter(DataWriter): - @with_config(spec=CsvDataWriterConfiguration) + @with_config(spec=CsvFormatConfiguration) def __init__( self, f: IO[Any], @@ -783,3 +784,16 @@ def get_best_writer_spec( return DataWriter.class_factory(file_format, item_format, native_writers).writer_spec() except DataWriterNotFound: return DataWriter.class_factory(file_format, item_format, ALL_WRITERS).writer_spec() + + +def create_import_spec( + item_file_format: TLoaderFileFormat, + possible_file_formats: Sequence[TLoaderFileFormat], +) -> FileWriterSpec: + """Creates writer spec that may be used only to import files""" + # can the item file be directly imported? 
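+    # the lookup below fails when the destination cannot accept the item's file format as-is;
+    # otherwise the writer spec registered for the ("object", format) pair is reused and
+    # re-tagged as a "file" spec, so the file is imported (hard-linked or copied) rather than re-written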
+ if item_file_format not in possible_file_formats: + raise SpecLookupFailed("file", possible_file_formats, item_file_format) + + spec = DataWriter.class_factory(item_file_format, "object", ALL_WRITERS).writer_spec() + return spec._replace(data_item_format="file") diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index d8361d7140..f28065782a 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -8,9 +8,10 @@ Tuple, Set, Protocol, + Union, get_args, ) - +from dlt.common.typing import TLoaderFileFormat from dlt.common.configuration.utils import serialize_value from dlt.common.configuration import configspec from dlt.common.configuration.specs import ContainerInjectableContext @@ -19,19 +20,11 @@ DestinationLoadingViaStagingNotSupported, DestinationLoadingWithoutStagingNotSupported, ) -from dlt.common.utils import identity - +from dlt.common.normalizers.naming import NamingConvention from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE from dlt.common.wei import EVM_DECIMAL_PRECISION -# known loader file formats -# jsonl - new line separated json documents -# typed-jsonl - internal extract -> normalize format bases on jsonl -# insert_values - insert SQL statements -# sql - any sql statement -TLoaderFileFormat = Literal["jsonl", "typed-jsonl", "insert_values", "parquet", "csv"] TLoaderParallelismStrategy = Literal["parallel", "table-sequential", "sequential"] - ALL_SUPPORTED_FILE_FORMATS: Set[TLoaderFileFormat] = set(get_args(TLoaderFileFormat)) @@ -61,9 +54,15 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): """Recommended file size in bytes when writing extract/load files""" preferred_staging_file_format: Optional[TLoaderFileFormat] = None supported_staging_file_formats: Sequence[TLoaderFileFormat] = None + format_datetime_literal: Callable[..., str] = None escape_identifier: Callable[[str], str] = None + "Escapes table name, column name and other identifiers" escape_literal: Callable[[Any], Any] = None - format_datetime_literal: Callable[..., str] = None + "Escapes string literal" + casefold_identifier: Callable[[str], str] = str + """Casing function applied by destination to represent case insensitive identifiers.""" + has_case_sensitive_identifiers: bool = None + """Tells if identifiers in destination are case sensitive, before case_identifier function is applied""" decimal_precision: Tuple[int, int] = None wei_precision: Tuple[int, int] = None max_identifier_length: int = None @@ -74,7 +73,8 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): is_max_text_data_type_length_in_bytes: bool = None supports_transactions: bool = None supports_ddl_transactions: bool = None - naming_convention: str = "snake_case" + # use naming convention in the schema + naming_convention: Union[str, NamingConvention] = None alter_add_multi_column: bool = True supports_truncate_command: bool = True schema_supports_numeric_precision: bool = True @@ -99,6 +99,7 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): @staticmethod def generic_capabilities( preferred_loader_file_format: TLoaderFileFormat = None, + naming_convention: Union[str, NamingConvention] = None, loader_file_format_adapter: LoaderFileFormatAdapter = None, supported_table_formats: Sequence["TTableFormat"] = None, # type: ignore[name-defined] # noqa: F821 ) -> "DestinationCapabilitiesContext": @@ -110,9 +111,12 @@ def generic_capabilities( caps.loader_file_format_adapter = 
loader_file_format_adapter caps.preferred_staging_file_format = None caps.supported_staging_file_formats = [] + caps.naming_convention = naming_convention or caps.naming_convention + caps.escape_identifier = str caps.supported_table_formats = supported_table_formats or [] - caps.escape_identifier = identity caps.escape_literal = serialize_value + caps.casefold_identifier = str + caps.has_case_sensitive_identifiers = True caps.format_datetime_literal = format_datetime_literal caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) caps.wei_precision = (EVM_DECIMAL_PRECISION, 0) diff --git a/dlt/common/destination/exceptions.py b/dlt/common/destination/exceptions.py index cd8f50bcce..c5f30401df 100644 --- a/dlt/common/destination/exceptions.py +++ b/dlt/common/destination/exceptions.py @@ -124,3 +124,16 @@ def __init__(self, schema_name: str, version_hash: str, stored_version_hash: str " schema in load package, you should first save it into schema storage. You can also" " use schema._bump_version() in test code to remove modified flag." ) + + +class DestinationInvalidFileFormat(DestinationTerminalException): + def __init__( + self, destination_type: str, file_format: str, file_name: str, message: str + ) -> None: + self.destination_type = destination_type + self.file_format = file_format + self.message = message + super().__init__( + f"Destination {destination_type} cannot process file {file_name} with format" + f" {file_format}: {message}" + ) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 9bb843a4c5..90f89b85d7 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -25,30 +25,27 @@ import inspect from dlt.common import logger +from dlt.common.configuration.specs.base_configuration import extract_inner_hint +from dlt.common.destination.utils import verify_schema_capabilities +from dlt.common.normalizers.naming import NamingConvention from dlt.common.schema import Schema, TTableSchema, TSchemaTables -from dlt.common.schema.typing import MERGE_STRATEGIES -from dlt.common.schema.exceptions import SchemaException from dlt.common.schema.utils import ( + get_file_format, get_write_disposition, get_table_format, - get_columns_names_with_prop, - has_column_with_prop, - get_first_column_name_with_prop, ) from dlt.common.configuration import configspec, resolve_configuration, known_sections, NotResolved from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.destination.exceptions import ( - IdentifierTooLongException, InvalidDestinationReference, UnknownDestinationModule, DestinationSchemaTampered, ) -from dlt.common.schema.utils import is_complete_column from dlt.common.schema.exceptions import UnknownTableException from dlt.common.storages import FileStorage from dlt.common.storages.load_storage import ParsedLoadJobFileName -from dlt.common.storages.load_package import LoadJobInfo +from dlt.common.storages.load_package import LoadJobInfo, TPipelineStateDoc TLoaderReplaceStrategy = Literal["truncate-and-insert", "insert-from-staging", "staging-optimized"] TDestinationConfig = TypeVar("TDestinationConfig", bound="DestinationClientConfiguration") @@ -67,13 +64,23 @@ class StorageSchemaInfo(NamedTuple): schema: str -class StateInfo(NamedTuple): +@dataclasses.dataclass +class StateInfo: version: int engine_version: int pipeline_name: str state: str created_at: 
datetime.datetime - dlt_load_id: str = None + version_hash: Optional[str] = None + _dlt_load_id: Optional[str] = None + + def as_doc(self) -> TPipelineStateDoc: + doc: TPipelineStateDoc = dataclasses.asdict(self) # type: ignore[assignment] + if self._dlt_load_id is None: + doc.pop("_dlt_load_id") + if self.version_hash is None: + doc.pop("version_hash") + return doc @configspec @@ -98,6 +105,25 @@ def __str__(self) -> str: def on_resolved(self) -> None: self.destination_name = self.destination_name or self.destination_type + @classmethod + def credentials_type( + cls, config: "DestinationClientConfiguration" = None + ) -> Type[CredentialsConfiguration]: + """Figure out credentials type, using hint resolvers for dynamic types + + For correct type resolution of filesystem, config should have bucket_url populated + """ + key = "credentials" + type_ = cls.get_resolvable_fields()[key] + if key in cls.__hint_resolvers__ and config is not None: + try: + # Type hint for this field is created dynamically + type_ = cls.__hint_resolvers__[key](config) + except Exception: + # we suppress failed hint resolutions + pass + return extract_inner_hint(type_) + @configspec class DestinationClientDwhConfiguration(DestinationClientConfiguration): @@ -253,11 +279,15 @@ class DoNothingFollowupJob(DoNothingJob, FollowupJob): class JobClientBase(ABC): - capabilities: ClassVar[DestinationCapabilitiesContext] = None - - def __init__(self, schema: Schema, config: DestinationClientConfiguration) -> None: + def __init__( + self, + schema: Schema, + config: DestinationClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: self.schema = schema self.config = config + self.capabilities = capabilities @abstractmethod def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: @@ -315,7 +345,7 @@ def should_truncate_table_before_load(self, table: TTableSchema) -> bool: def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], - table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, + completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, ) -> List[NewLoadJob]: """Creates a list of followup jobs that should be executed after a table chain is completed""" return [] @@ -336,96 +366,13 @@ def __exit__( pass def _verify_schema(self) -> None: - """Verifies and cleans up a schema before loading - - * Checks all table and column name lengths against destination capabilities and raises on too long identifiers - * Removes and warns on (unbound) incomplete columns - """ - - for table in self.schema.data_tables(): - table_name = table["name"] - if len(table_name) > self.capabilities.max_identifier_length: - raise IdentifierTooLongException( - self.config.destination_type, - "table", - table_name, - self.capabilities.max_identifier_length, - ) - if table.get("write_disposition") == "merge": - if "x-merge-strategy" in table and table["x-merge-strategy"] not in MERGE_STRATEGIES: # type: ignore[typeddict-item] - raise SchemaException( - f'"{table["x-merge-strategy"]}" is not a valid merge strategy. ' # type: ignore[typeddict-item] - f"""Allowed values: {', '.join(['"' + s + '"' for s in MERGE_STRATEGIES])}.""" - ) - if ( - table.get("x-merge-strategy") == "delete-insert" - and not has_column_with_prop(table, "primary_key") - and not has_column_with_prop(table, "merge_key") - ): - logger.warning( - f"Table {table_name} has `write_disposition` set to `merge`" - " and `merge_strategy` set to `delete-insert`, but no primary or" - " merge keys defined." 
- " dlt will fall back to `append` for this table." - ) - if has_column_with_prop(table, "hard_delete"): - if len(get_columns_names_with_prop(table, "hard_delete")) > 1: - raise SchemaException( - f'Found multiple "hard_delete" column hints for table "{table_name}" in' - f' schema "{self.schema.name}" while only one is allowed:' - f' {", ".join(get_columns_names_with_prop(table, "hard_delete"))}.' - ) - if table.get("write_disposition") in ("replace", "append"): - logger.warning( - f"""The "hard_delete" column hint for column "{get_first_column_name_with_prop(table, 'hard_delete')}" """ - f'in table "{table_name}" with write disposition' - f' "{table.get("write_disposition")}"' - f' in schema "{self.schema.name}" will be ignored.' - ' The "hard_delete" column hint is only applied when using' - ' the "merge" write disposition.' - ) - if has_column_with_prop(table, "dedup_sort"): - if len(get_columns_names_with_prop(table, "dedup_sort")) > 1: - raise SchemaException( - f'Found multiple "dedup_sort" column hints for table "{table_name}" in' - f' schema "{self.schema.name}" while only one is allowed:' - f' {", ".join(get_columns_names_with_prop(table, "dedup_sort"))}.' - ) - if table.get("write_disposition") in ("replace", "append"): - logger.warning( - f"""The "dedup_sort" column hint for column "{get_first_column_name_with_prop(table, 'dedup_sort')}" """ - f'in table "{table_name}" with write disposition' - f' "{table.get("write_disposition")}"' - f' in schema "{self.schema.name}" will be ignored.' - ' The "dedup_sort" column hint is only applied when using' - ' the "merge" write disposition.' - ) - if table.get("write_disposition") == "merge" and not has_column_with_prop( - table, "primary_key" - ): - logger.warning( - f"""The "dedup_sort" column hint for column "{get_first_column_name_with_prop(table, 'dedup_sort')}" """ - f'in table "{table_name}" with write disposition' - f' "{table.get("write_disposition")}"' - f' in schema "{self.schema.name}" will be ignored.' - ' The "dedup_sort" column hint is only applied when a' - " primary key has been specified." - ) - for column_name, column in dict(table["columns"]).items(): - if len(column_name) > self.capabilities.max_column_identifier_length: - raise IdentifierTooLongException( - self.config.destination_type, - "column", - f"{table_name}.{column_name}", - self.capabilities.max_column_identifier_length, - ) - if not is_complete_column(column): - logger.warning( - f"A column {column_name} in table {table_name} in schema" - f" {self.schema.name} is incomplete. It was not bound to the data during" - " normalizations stage and its data type is unknown. Did you add this" - " column manually in code ie. as a merge key?" 
- ) + """Verifies schema before loading""" + if exceptions := verify_schema_capabilities( + self.schema, self.capabilities, self.config.destination_type, warnings=False + ): + for exception in exceptions: + logger.error(str(exception)) + raise exceptions[0] def prepare_load_table( self, table_name: str, prepare_for_staging: bool = False @@ -438,9 +385,11 @@ def prepare_load_table( table["write_disposition"] = get_write_disposition(self.schema.tables, table_name) if "table_format" not in table: table["table_format"] = get_table_format(self.schema.tables, table_name) + if "file_format" not in table: + table["file_format"] = get_file_format(self.schema.tables, table_name) return table except KeyError: - raise UnknownTableException(table_name) + raise UnknownTableException(self.schema.name, table_name) class WithStateSync(ABC): @@ -497,7 +446,10 @@ class Destination(ABC, Generic[TDestinationConfig, TDestinationClient]): with credentials and other config params. """ - config_params: Optional[Dict[str, Any]] = None + config_params: Dict[str, Any] + """Explicit config params, overriding any injected or default values.""" + caps_params: Dict[str, Any] + """Explicit capabilities params, overriding any default values for this destination""" def __init__(self, **kwargs: Any) -> None: # Create initial unresolved destination config @@ -505,9 +457,27 @@ def __init__(self, **kwargs: Any) -> None: # to supersede config from the environment or pipeline args sig = inspect.signature(self.__class__.__init__) params = sig.parameters - self.config_params = { - k: v for k, v in kwargs.items() if k not in params or v != params[k].default - } + + # get available args + spec = self.spec + spec_fields = spec.get_resolvable_fields() + caps_fields = DestinationCapabilitiesContext.get_resolvable_fields() + + # remove default kwargs + kwargs = {k: v for k, v in kwargs.items() if k not in params or v != params[k].default} + + # warn on unknown params + for k in list(kwargs): + if k not in spec_fields and k not in caps_fields: + logger.warning( + f"When initializing destination factory of type {self.destination_type}," + f" argument {k} is not a valid field in {spec.__name__} or destination" + " capabilities" + ) + kwargs.pop(k) + + self.config_params = {k: v for k, v in kwargs.items() if k in spec_fields} + self.caps_params = {k: v for k, v in kwargs.items() if k in caps_fields} @property @abstractmethod @@ -515,9 +485,37 @@ def spec(self) -> Type[TDestinationConfig]: """A spec of destination configuration that also contains destination credentials""" ... + def capabilities( + self, config: Optional[TDestinationConfig] = None, naming: Optional[NamingConvention] = None + ) -> DestinationCapabilitiesContext: + """Destination capabilities ie. supported loader file formats, identifier name lengths, naming conventions, escape function etc. + Explicit caps arguments passed to the factory init and stored in `caps_params` are applied. + + If `config` is provided, it is used to adjust the capabilities, otherwise the explicit config composed just of `config_params` passed + to factory init is applied + If `naming` is provided, the case sensitivity and case folding are adjusted. 
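+
+        A minimal usage sketch (illustrative only: `dest` stands for any concrete destination
+        factory instance and `schema` for a dlt Schema):
+
+            dest = dlt.destinations.postgres()
+            caps = dest.capabilities(naming=schema.naming)
+            # caps.casefold_identifier and caps.has_case_sensitive_identifiers now reflect
+            # the case sensitivity requested by the schema's naming convention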
+ """ + caps = self._raw_capabilities() + caps.update(self.caps_params) + # get explicit config if final config not passed + if config is None: + # create mock credentials to avoid credentials being resolved + init_config = self.spec() + init_config.update(self.config_params) + credentials = self.spec.credentials_type(init_config)() + credentials.__is_resolved__ = True + config = self.spec(credentials=credentials) + try: + config = self.configuration(config, accept_partial=True) + except Exception: + # in rare cases partial may fail ie. when invalid native value is present + # in that case we fallback to "empty" config + pass + return self.adjust_capabilities(caps, config, naming) + @abstractmethod - def capabilities(self) -> DestinationCapabilitiesContext: - """Destination capabilities ie. supported loader file formats, identifier name lengths, naming conventions, escape function etc.""" + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + """Returns raw capabilities, before being adjusted with naming convention and config""" ... @property @@ -540,16 +538,61 @@ def client_class(self) -> Type[TDestinationClient]: """A job client class responsible for starting and resuming load jobs""" ... - def configuration(self, initial_config: TDestinationConfig) -> TDestinationConfig: + def configuration( + self, initial_config: TDestinationConfig, accept_partial: bool = False + ) -> TDestinationConfig: """Get a fully resolved destination config from the initial config""" + config = resolve_configuration( - initial_config, + initial_config or self.spec(), sections=(known_sections.DESTINATION, self.destination_name), # Already populated values will supersede resolved env config explicit_value=self.config_params, + accept_partial=accept_partial, ) return config + def client( + self, schema: Schema, initial_config: TDestinationConfig = None + ) -> TDestinationClient: + """Returns a configured instance of the destination's job client""" + config = self.configuration(initial_config) + return self.client_class(schema, config, self.capabilities(config, schema.naming)) + + @classmethod + def adjust_capabilities( + cls, + caps: DestinationCapabilitiesContext, + config: TDestinationConfig, + naming: Optional[NamingConvention], + ) -> DestinationCapabilitiesContext: + """Adjust the capabilities to match the case sensitivity as requested by naming convention.""" + # if naming not provided, skip the adjustment + if not naming or not naming.is_case_sensitive: + # all destinations are configured to be case insensitive so there's nothing to adjust + return caps + if not caps.has_case_sensitive_identifiers: + if caps.casefold_identifier is str: + logger.info( + f"Naming convention {naming.name()} is case sensitive but the destination does" + " not support case sensitive identifiers. Nevertheless identifier casing will" + " be preserved in the destination schema." + ) + else: + logger.warn( + f"Naming convention {naming.name()} is case sensitive but the destination does" + " not support case sensitive identifiers. 
Destination will case fold all the" + f" identifiers with {caps.casefold_identifier}" + ) + else: + # adjust case folding to store casefold identifiers in the schema + if caps.casefold_identifier is not str: + caps.casefold_identifier = str + logger.info( + f"Enabling case sensitive identifiers for naming convention {naming.name()}" + ) + return caps + @staticmethod def to_name(ref: TDestinationReferenceArg) -> str: if ref is None: @@ -562,7 +605,7 @@ def to_name(ref: TDestinationReferenceArg) -> str: @staticmethod def normalize_type(destination_type: str) -> str: - """Normalizes destination type string into a canonical form. Assumes that type names without dots correspond to build in destinations.""" + """Normalizes destination type string into a canonical form. Assumes that type names without dots correspond to built in destinations.""" if "." not in destination_type: destination_type = "dlt.destinations." + destination_type # the next two lines shorten the dlt internal destination paths to dlt.destinations. @@ -625,11 +668,5 @@ def from_reference( raise InvalidDestinationReference(ref) from e return dest - def client( - self, schema: Schema, initial_config: TDestinationConfig = None - ) -> TDestinationClient: - """Returns a configured instance of the destination's job client""" - return self.client_class(schema, self.configuration(initial_config)) - TDestination = Destination[DestinationClientConfiguration, JobClientBase] diff --git a/dlt/common/destination/utils.py b/dlt/common/destination/utils.py new file mode 100644 index 0000000000..2c5e97df14 --- /dev/null +++ b/dlt/common/destination/utils.py @@ -0,0 +1,115 @@ +from typing import List + +from dlt.common import logger +from dlt.common.destination.exceptions import IdentifierTooLongException +from dlt.common.schema import Schema +from dlt.common.schema.exceptions import ( + SchemaIdentifierNormalizationCollision, +) +from dlt.common.schema.utils import is_complete_column +from dlt.common.typing import DictStrStr + +from .capabilities import DestinationCapabilitiesContext + + +def verify_schema_capabilities( + schema: Schema, + capabilities: DestinationCapabilitiesContext, + destination_type: str, + warnings: bool = True, +) -> List[Exception]: + """Verifies schema tables before loading against capabilities. Returns a list of exceptions representing critical problems with the schema. + It will log warnings by default. It is up to the caller to eventually raise exception + + * Checks all table and column name lengths against destination capabilities and raises on too long identifiers + * Checks if schema has collisions due to case sensitivity of the identifiers + """ + + log = logger.warning if warnings else logger.info + # collect all exceptions to show all problems in the schema + exception_log: List[Exception] = [] + # combined casing function + case_identifier = lambda ident: capabilities.casefold_identifier( + (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) # type: ignore + ) + table_name_lookup: DictStrStr = {} + # name collision explanation + collision_msg = "Destination is case " + ( + "sensitive" if capabilities.has_case_sensitive_identifiers else "insensitive" + ) + if capabilities.casefold_identifier is not str: + collision_msg += ( + f" but it uses {capabilities.casefold_identifier} to generate case insensitive" + " identifiers. You may try to change the destination capabilities by changing the" + " `casefold_identifier` to `str`" + ) + collision_msg += ( + ". 
Please clean up your data before loading so the entities have different name. You can" + " also change to case insensitive naming convention. Note that in that case data from both" + " columns will be merged into one." + ) + + # check for any table clashes + for table in schema.data_tables(): + table_name = table["name"] + # detect table name conflict + cased_table_name = case_identifier(table_name) + if cased_table_name in table_name_lookup: + conflict_table_name = table_name_lookup[cased_table_name] + exception_log.append( + SchemaIdentifierNormalizationCollision( + schema.name, + table_name, + "table", + table_name, + conflict_table_name, + schema.naming.name(), + collision_msg, + ) + ) + table_name_lookup[cased_table_name] = table_name + if len(table_name) > capabilities.max_identifier_length: + exception_log.append( + IdentifierTooLongException( + destination_type, + "table", + table_name, + capabilities.max_identifier_length, + ) + ) + + column_name_lookup: DictStrStr = {} + for column_name, column in dict(table["columns"]).items(): + # detect table name conflict + cased_column_name = case_identifier(column_name) + if cased_column_name in column_name_lookup: + conflict_column_name = column_name_lookup[cased_column_name] + exception_log.append( + SchemaIdentifierNormalizationCollision( + schema.name, + table_name, + "column", + column_name, + conflict_column_name, + schema.naming.name(), + collision_msg, + ) + ) + column_name_lookup[cased_column_name] = column_name + if len(column_name) > capabilities.max_column_identifier_length: + exception_log.append( + IdentifierTooLongException( + destination_type, + "column", + f"{table_name}.{column_name}", + capabilities.max_column_identifier_length, + ) + ) + if not is_complete_column(column): + log( + f"A column {column_name} in table {table_name} in schema" + f" {schema.name} is incomplete. It was not bound to the data during" + " normalizations stage and its data type is unknown. Did you add this" + " column manually in code ie. as a merge key?" + ) + return exception_log diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index 8a6dc68078..ee249b111c 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -348,13 +348,13 @@ def normalize_py_arrow_item( def get_normalized_arrow_fields_mapping(schema: pyarrow.Schema, naming: NamingConvention) -> StrStr: - """Normalizes schema field names and returns mapping from original to normalized name. Raises on name clashes""" + """Normalizes schema field names and returns mapping from original to normalized name. Raises on name collisions""" norm_f = naming.normalize_identifier name_mapping = {n.name: norm_f(n.name) for n in schema} # verify if names uniquely normalize normalized_names = set(name_mapping.values()) if len(name_mapping) != len(normalized_names): - raise NameNormalizationClash( + raise NameNormalizationCollision( f"Arrow schema fields normalized from {list(name_mapping.keys())} to" f" {list(normalized_names)}" ) @@ -497,7 +497,7 @@ def cast_arrow_schema_types( return schema -class NameNormalizationClash(ValueError): +class NameNormalizationCollision(ValueError): def __init__(self, reason: str) -> None: - msg = f"Arrow column name clash after input data normalization. {reason}" + msg = f"Arrow column name collision after input data normalization. 
{reason}" super().__init__(msg) diff --git a/dlt/common/normalizers/__init__.py b/dlt/common/normalizers/__init__.py index 2ff41d4c12..af6add6a19 100644 --- a/dlt/common/normalizers/__init__.py +++ b/dlt/common/normalizers/__init__.py @@ -1,11 +1,9 @@ -from dlt.common.normalizers.configuration import NormalizersConfiguration from dlt.common.normalizers.typing import TJSONNormalizer, TNormalizersConfig -from dlt.common.normalizers.utils import explicit_normalizers, import_normalizers +from dlt.common.normalizers.naming import NamingConvention + __all__ = [ - "NormalizersConfiguration", + "NamingConvention", "TJSONNormalizer", "TNormalizersConfig", - "explicit_normalizers", - "import_normalizers", ] diff --git a/dlt/common/normalizers/configuration.py b/dlt/common/normalizers/configuration.py index 54b725db1f..4e9d9c4a20 100644 --- a/dlt/common/normalizers/configuration.py +++ b/dlt/common/normalizers/configuration.py @@ -1,9 +1,9 @@ -from typing import ClassVar, Optional, TYPE_CHECKING +from typing import ClassVar, Optional, Union from dlt.common.configuration import configspec from dlt.common.configuration.specs import BaseConfiguration, known_sections from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.normalizers.typing import TJSONNormalizer +from dlt.common.normalizers.naming import NamingConvention from dlt.common.typing import DictStrAny @@ -12,8 +12,9 @@ class NormalizersConfiguration(BaseConfiguration): # always in section __section__: ClassVar[str] = known_sections.SCHEMA - naming: Optional[str] = None + naming: Optional[Union[str, NamingConvention]] = None json_normalizer: Optional[DictStrAny] = None + allow_identifier_change_on_table_with_data: Optional[bool] = None destination_capabilities: Optional[DestinationCapabilitiesContext] = None # injectable def on_resolved(self) -> None: diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index bad275ca4f..91af42a6c5 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -5,7 +5,7 @@ from dlt.common.normalizers.typing import TJSONNormalizer from dlt.common.normalizers.utils import generate_dlt_id, DLT_ID_LENGTH_BYTES -from dlt.common.typing import DictStrAny, DictStrStr, TDataItem, StrAny +from dlt.common.typing import DictStrAny, TDataItem, StrAny from dlt.common.schema import Schema from dlt.common.schema.typing import ( TColumnSchema, @@ -23,28 +23,10 @@ ) from dlt.common.validation import validate_dict -EMPTY_KEY_IDENTIFIER = "_empty" # replace empty keys with this - - -class TDataItemRow(TypedDict, total=False): - _dlt_id: str # unique id of current row - - -class TDataItemRowRoot(TDataItemRow, total=False): - _dlt_load_id: (str) # load id to identify records loaded together that ie. 
need to be processed - # _dlt_meta: TEventDLTMeta # stores metadata, should never be sent to the normalizer - - -class TDataItemRowChild(TDataItemRow, total=False): - _dlt_root_id: str # unique id of top level parent - _dlt_parent_id: str # unique id of parent row - _dlt_list_idx: int # position in the list of rows - value: Any # for lists of simple types - class RelationalNormalizerConfigPropagation(TypedDict, total=False): - root: Optional[Mapping[str, TColumnName]] - tables: Optional[Mapping[str, Mapping[str, TColumnName]]] + root: Optional[Dict[TColumnName, TColumnName]] + tables: Optional[Dict[str, Dict[TColumnName, TColumnName]]] class RelationalNormalizerConfig(TypedDict, total=False): @@ -54,6 +36,23 @@ class RelationalNormalizerConfig(TypedDict, total=False): class DataItemNormalizer(DataItemNormalizerBase[RelationalNormalizerConfig]): + # known normalizer props + C_DLT_ID = "_dlt_id" + """unique id of current row""" + C_DLT_LOAD_ID = "_dlt_load_id" + """load id to identify records loaded together that ie. need to be processed""" + C_DLT_ROOT_ID = "_dlt_root_id" + """unique id of top level parent""" + C_DLT_PARENT_ID = "_dlt_parent_id" + """unique id of parent row""" + C_DLT_LIST_IDX = "_dlt_list_idx" + """position in the list of rows""" + C_VALUE = "value" + """for lists of simple types""" + + # other constants + EMPTY_KEY_IDENTIFIER = "_empty" # replace empty keys with this + normalizer_config: RelationalNormalizerConfig propagation_config: RelationalNormalizerConfigPropagation max_nesting: int @@ -63,12 +62,29 @@ def __init__(self, schema: Schema) -> None: """This item normalizer works with nested dictionaries. It flattens dictionaries and descends into lists. It yields row dictionaries at each nesting level.""" self.schema = schema + self.naming = schema.naming self._reset() def _reset(self) -> None: - self.normalizer_config = ( - self.schema._normalizers_config["json"].get("config") or {} # type: ignore[assignment] + # normalize known normalizer column identifiers + self.c_dlt_id: TColumnName = TColumnName(self.naming.normalize_identifier(self.C_DLT_ID)) + self.c_dlt_load_id: TColumnName = TColumnName( + self.naming.normalize_identifier(self.C_DLT_LOAD_ID) + ) + self.c_dlt_root_id: TColumnName = TColumnName( + self.naming.normalize_identifier(self.C_DLT_ROOT_ID) + ) + self.c_dlt_parent_id: TColumnName = TColumnName( + self.naming.normalize_identifier(self.C_DLT_PARENT_ID) + ) + self.c_dlt_list_idx: TColumnName = TColumnName( + self.naming.normalize_identifier(self.C_DLT_LIST_IDX) ) + self.c_value: TColumnName = TColumnName(self.naming.normalize_identifier(self.C_VALUE)) + + # normalize config + + self.normalizer_config = self.schema._normalizers_config["json"].get("config") or {} # type: ignore[assignment] self.propagation_config = self.normalizer_config.get("propagation", None) self.max_nesting = self.normalizer_config.get("max_nesting", 1000) self._skip_primary_key = {} @@ -103,8 +119,8 @@ def _is_complex_type(self, table_name: str, field_name: str, _r_lvl: int) -> boo return data_type == "complex" def _flatten( - self, table: str, dict_row: TDataItemRow, _r_lvl: int - ) -> Tuple[TDataItemRow, Dict[Tuple[str, ...], Sequence[Any]]]: + self, table: str, dict_row: DictStrAny, _r_lvl: int + ) -> Tuple[DictStrAny, Dict[Tuple[str, ...], Sequence[Any]]]: out_rec_row: DictStrAny = {} out_rec_list: Dict[Tuple[str, ...], Sequence[Any]] = {} schema_naming = self.schema.naming @@ -115,7 +131,7 @@ def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] 
= ()) - norm_k = schema_naming.normalize_identifier(k) else: # for empty keys in the data use _ - norm_k = EMPTY_KEY_IDENTIFIER + norm_k = self.EMPTY_KEY_IDENTIFIER # if norm_k != k: # print(f"{k} -> {norm_k}") child_name = ( @@ -139,7 +155,7 @@ def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] = ()) - out_rec_row[child_name] = v norm_row_dicts(dict_row, _r_lvl) - return cast(TDataItemRow, out_rec_row), out_rec_list + return out_rec_row, out_rec_list @staticmethod def get_row_hash(row: Dict[str, Any]) -> str: @@ -160,7 +176,7 @@ def _get_child_row_hash(parent_row_id: str, child_table: str, list_idx: int) -> return digest128(f"{parent_row_id}_{child_table}_{list_idx}", DLT_ID_LENGTH_BYTES) @staticmethod - def _link_row(row: TDataItemRowChild, parent_row_id: str, list_idx: int) -> TDataItemRowChild: + def _link_row(row: DictStrAny, parent_row_id: str, list_idx: int) -> DictStrAny: assert parent_row_id row["_dlt_parent_id"] = parent_row_id row["_dlt_list_idx"] = list_idx @@ -168,11 +184,11 @@ def _link_row(row: TDataItemRowChild, parent_row_id: str, list_idx: int) -> TDat return row @staticmethod - def _extend_row(extend: DictStrAny, row: TDataItemRow) -> None: - row.update(extend) # type: ignore + def _extend_row(extend: DictStrAny, row: DictStrAny) -> None: + row.update(extend) def _add_row_id( - self, table: str, row: TDataItemRow, parent_row_id: str, pos: int, _r_lvl: int + self, table: str, row: DictStrAny, parent_row_id: str, pos: int, _r_lvl: int ) -> str: # row_id is always random, no matter if primary_key is present or not row_id = generate_dlt_id() @@ -182,17 +198,17 @@ def _add_row_id( # child table row deterministic hash row_id = DataItemNormalizer._get_child_row_hash(parent_row_id, table, pos) # link to parent table - DataItemNormalizer._link_row(cast(TDataItemRowChild, row), parent_row_id, pos) - row["_dlt_id"] = row_id + DataItemNormalizer._link_row(row, parent_row_id, pos) + row[self.c_dlt_id] = row_id return row_id - def _get_propagated_values(self, table: str, row: TDataItemRow, _r_lvl: int) -> StrAny: + def _get_propagated_values(self, table: str, row: DictStrAny, _r_lvl: int) -> StrAny: extend: DictStrAny = {} config = self.propagation_config if config: # mapping(k:v): propagate property with name "k" as property with name "v" in child table - mappings: DictStrStr = {} + mappings: Dict[TColumnName, TColumnName] = {} if _r_lvl == 0: mappings.update(config.get("root") or {}) if table in (config.get("tables") or {}): @@ -200,7 +216,7 @@ def _get_propagated_values(self, table: str, row: TDataItemRow, _r_lvl: int) -> # look for keys and create propagation as values for prop_from, prop_as in mappings.items(): if prop_from in row: - extend[prop_as] = row[prop_from] # type: ignore + extend[prop_as] = row[prop_from] return extend @@ -214,7 +230,7 @@ def _normalize_list( parent_row_id: Optional[str] = None, _r_lvl: int = 0, ) -> TNormalizedRowIterator: - v: TDataItemRowChild = None + v: DictStrAny = None table = self.schema.naming.shorten_fragments(*parent_path, *ident_path) for idx, v in enumerate(seq): @@ -238,14 +254,14 @@ def _normalize_list( # list of simple types child_row_hash = DataItemNormalizer._get_child_row_hash(parent_row_id, table, idx) wrap_v = wrap_in_dict(v) - wrap_v["_dlt_id"] = child_row_hash + wrap_v[self.c_dlt_id] = child_row_hash e = DataItemNormalizer._link_row(wrap_v, parent_row_id, idx) DataItemNormalizer._extend_row(extend, e) yield (table, self.schema.naming.shorten_fragments(*parent_path)), e def _normalize_row( self, - dict_row: 
TDataItemRow, + dict_row: DictStrAny, extend: DictStrAny, ident_path: Tuple[str, ...], parent_path: Tuple[str, ...] = (), @@ -258,14 +274,14 @@ def _normalize_row( table = schema.naming.shorten_fragments(*parent_path, *ident_path) # compute row hash and set as row id if row_hash: - row_id = self.get_row_hash(dict_row) # type: ignore[arg-type] - dict_row["_dlt_id"] = row_id + row_id = self.get_row_hash(dict_row) + dict_row[self.c_dlt_id] = row_id # flatten current row and extract all lists to recur into flattened_row, lists = self._flatten(table, dict_row, _r_lvl) # always extend row DataItemNormalizer._extend_row(extend, flattened_row) # infer record hash or leave existing primary key if present - row_id = flattened_row.get("_dlt_id", None) + row_id = flattened_row.get(self.c_dlt_id, None) if not row_id: row_id = self._add_row_id(table, flattened_row, parent_row_id, pos, _r_lvl) @@ -292,43 +308,55 @@ def _normalize_row( ) def extend_schema(self) -> None: - # validate config + """Extends Schema with normalizer-specific hints and settings. + + This method is called by Schema when instance is created or restored from storage. + """ config = cast( RelationalNormalizerConfig, self.schema._normalizers_config["json"].get("config") or {}, ) DataItemNormalizer._validate_normalizer_config(self.schema, config) - # quick check to see if hints are applied - default_hints = self.schema.settings.get("default_hints") or {} - if "not_null" in default_hints and "^_dlt_id$" in default_hints["not_null"]: - return - # add hints - self.schema.merge_hints( + # add hints, do not compile. + self.schema._merge_hints( { "not_null": [ - TSimpleRegex("_dlt_id"), - TSimpleRegex("_dlt_root_id"), - TSimpleRegex("_dlt_parent_id"), - TSimpleRegex("_dlt_list_idx"), - TSimpleRegex("_dlt_load_id"), + TSimpleRegex(self.c_dlt_id), + TSimpleRegex(self.c_dlt_root_id), + TSimpleRegex(self.c_dlt_parent_id), + TSimpleRegex(self.c_dlt_list_idx), + TSimpleRegex(self.c_dlt_load_id), ], - "foreign_key": [TSimpleRegex("_dlt_parent_id")], - "root_key": [TSimpleRegex("_dlt_root_id")], - "unique": [TSimpleRegex("_dlt_id")], - } + "foreign_key": [TSimpleRegex(self.c_dlt_parent_id)], + "root_key": [TSimpleRegex(self.c_dlt_root_id)], + "unique": [TSimpleRegex(self.c_dlt_id)], + }, + normalize_identifiers=False, # already normalized ) for table_name in self.schema.tables.keys(): self.extend_table(table_name) def extend_table(self, table_name: str) -> None: - # if the table has a merge w_d, add propagation info to normalizer + """If the table has a merge write disposition, add propagation info to normalizer + + Called by Schema when new table is added to schema or table is updated with partial table. + Table name should be normalized. 
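+        The added propagation makes the root table `_dlt_id` travel into child tables as
+        `_dlt_root_id` (declared as the `root_key` hint in `extend_schema`), so rows of nested
+        tables can be tied back to their root row.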
+ """ table = self.schema.tables.get(table_name) if not table.get("parent") and table.get("write_disposition") == "merge": DataItemNormalizer.update_normalizer_config( self.schema, - {"propagation": {"tables": {table_name: {"_dlt_id": TColumnName("_dlt_root_id")}}}}, + { + "propagation": { + "tables": { + table_name: { + TColumnName(self.c_dlt_id): TColumnName(self.c_dlt_root_id) + } + } + } + }, ) def normalize_data_item( @@ -338,18 +366,20 @@ def normalize_data_item( if not isinstance(item, dict): item = wrap_in_dict(item) # we will extend event with all the fields necessary to load it as root row - row = cast(TDataItemRowRoot, item) + row = cast(DictStrAny, item) # identify load id if loaded data must be processed after loading incrementally - row["_dlt_load_id"] = load_id + row[self.c_dlt_load_id] = load_id + # determine if row hash should be used as dlt id row_hash = False if self._is_scd2_table(self.schema, table_name): - row_hash = self._dlt_id_is_row_hash(self.schema, table_name) + row_hash = self._dlt_id_is_row_hash(self.schema, table_name, self.c_dlt_id) self._validate_validity_column_names( - self._get_validity_column_names(self.schema, table_name), item + self.schema.name, self._get_validity_column_names(self.schema, table_name), item ) + yield from self._normalize_row( - cast(TDataItemRowChild, row), + row, {}, (self.schema.naming.normalize_table_identifier(table_name),), row_hash=row_hash, @@ -365,12 +395,12 @@ def ensure_this_normalizer(cls, norm_config: TJSONNormalizer) -> None: @classmethod def update_normalizer_config(cls, schema: Schema, config: RelationalNormalizerConfig) -> None: cls._validate_normalizer_config(schema, config) - norm_config = schema._normalizers_config["json"] - cls.ensure_this_normalizer(norm_config) - if "config" in norm_config: - update_dict_nested(norm_config["config"], config) # type: ignore + existing_config = schema._normalizers_config["json"] + cls.ensure_this_normalizer(existing_config) + if "config" in existing_config: + update_dict_nested(existing_config["config"], config) # type: ignore else: - norm_config["config"] = config + existing_config["config"] = config @classmethod def get_normalizer_config(cls, schema: Schema) -> RelationalNormalizerConfig: @@ -380,6 +410,29 @@ def get_normalizer_config(cls, schema: Schema) -> RelationalNormalizerConfig: @staticmethod def _validate_normalizer_config(schema: Schema, config: RelationalNormalizerConfig) -> None: + """Normalizes all known column identifiers according to the schema and then validates the configuration""" + + def _normalize_prop( + mapping: Mapping[TColumnName, TColumnName] + ) -> Dict[TColumnName, TColumnName]: + return { + TColumnName(schema.naming.normalize_path(from_col)): TColumnName( + schema.naming.normalize_path(to_col) + ) + for from_col, to_col in mapping.items() + } + + # normalize the identifiers first + propagation_config = config.get("propagation") + if propagation_config: + if "root" in propagation_config: + propagation_config["root"] = _normalize_prop(propagation_config["root"]) + if "tables" in propagation_config: + for table_name in propagation_config["tables"]: + propagation_config["tables"][table_name] = _normalize_prop( + propagation_config["tables"][table_name] + ) + validate_dict( RelationalNormalizerConfig, config, @@ -410,21 +463,22 @@ def _get_validity_column_names(schema: Schema, table_name: str) -> List[Optional @staticmethod @lru_cache(maxsize=None) - def _dlt_id_is_row_hash(schema: Schema, table_name: str) -> bool: + def _dlt_id_is_row_hash(schema: Schema, 
table_name: str, c_dlt_id: str) -> bool: return ( schema.get_table(table_name)["columns"] # type: ignore[return-value] - .get("_dlt_id", {}) + .get(c_dlt_id, {}) .get("x-row-version", False) ) @staticmethod def _validate_validity_column_names( - validity_column_names: List[Optional[str]], item: TDataItem + schema_name: str, validity_column_names: List[Optional[str]], item: TDataItem ) -> None: """Raises exception if configured validity column name appears in data item.""" for validity_column_name in validity_column_names: if validity_column_name in item.keys(): raise ColumnNameConflictException( + schema_name, "Found column in data item with same name as validity column" - f' "{validity_column_name}".' + f' "{validity_column_name}".', ) diff --git a/dlt/common/normalizers/naming/__init__.py b/dlt/common/normalizers/naming/__init__.py index 967fb9643e..2b3ecd74d0 100644 --- a/dlt/common/normalizers/naming/__init__.py +++ b/dlt/common/normalizers/naming/__init__.py @@ -1,3 +1,3 @@ -from .naming import SupportsNamingConvention, NamingConvention +from .naming import NamingConvention -__all__ = ["SupportsNamingConvention", "NamingConvention"] +__all__ = ["NamingConvention"] diff --git a/dlt/common/normalizers/naming/direct.py b/dlt/common/normalizers/naming/direct.py index 0998650852..c164e28365 100644 --- a/dlt/common/normalizers/naming/direct.py +++ b/dlt/common/normalizers/naming/direct.py @@ -5,7 +5,6 @@ class NamingConvention(BaseNamingConvention): PATH_SEPARATOR = "▶" - _CLEANUP_TABLE = str.maketrans(".\n\r'\"▶", "______") def normalize_identifier(self, identifier: str) -> str: diff --git a/dlt/common/normalizers/naming/duck_case.py b/dlt/common/normalizers/naming/duck_case.py index 063482a799..77c41e0e43 100644 --- a/dlt/common/normalizers/naming/duck_case.py +++ b/dlt/common/normalizers/naming/duck_case.py @@ -6,7 +6,13 @@ class NamingConvention(SnakeCaseNamingConvention): _CLEANUP_TABLE = str.maketrans('\n\r"', "___") - _RE_LEADING_DIGITS = None # do not remove leading digits + + def __init__(self, max_length: int = None) -> None: + """Case sensitive naming convention preserving all unicode characters except new line(s). Uses __ for path + separation and will replace multiple underscores with a single one. + """ + super().__init__(max_length) + self.is_case_sensitive = True @staticmethod @lru_cache(maxsize=None) diff --git a/dlt/common/normalizers/naming/exceptions.py b/dlt/common/normalizers/naming/exceptions.py index 572fc7e0d0..d8448fa1e0 100644 --- a/dlt/common/normalizers/naming/exceptions.py +++ b/dlt/common/normalizers/naming/exceptions.py @@ -5,21 +5,22 @@ class NormalizersException(DltException): pass -class UnknownNamingModule(NormalizersException): +class UnknownNamingModule(ImportError, NormalizersException): def __init__(self, naming_module: str) -> None: self.naming_module = naming_module if "." 
in naming_module: msg = f"Naming module {naming_module} could not be found and imported" else: - msg = f"Naming module {naming_module} is not one of the standard dlt naming convention" + msg = f"Naming module {naming_module} is not one of the standard dlt naming conventions" super().__init__(msg) class InvalidNamingModule(NormalizersException): - def __init__(self, naming_module: str) -> None: + def __init__(self, naming_module: str, naming_class: str) -> None: self.naming_module = naming_module + self.naming_class = naming_class msg = ( - f"Naming module {naming_module} does not implement required SupportsNamingConvention" - " protocol" + f"In naming module '{naming_module}' the class '{naming_class}' is not a" + " NamingConvention" ) super().__init__(msg) diff --git a/dlt/common/normalizers/naming/naming.py b/dlt/common/normalizers/naming/naming.py index fccb147981..b806f11eec 100644 --- a/dlt/common/normalizers/naming/naming.py +++ b/dlt/common/normalizers/naming/naming.py @@ -3,7 +3,7 @@ from functools import lru_cache import math import hashlib -from typing import Any, List, Protocol, Sequence, Type +from typing import Sequence class NamingConvention(ABC): @@ -11,7 +11,11 @@ class NamingConvention(ABC): _DEFAULT_COLLISION_PROB = 0.001 def __init__(self, max_length: int = None) -> None: + """Initializes naming convention to generate identifier with `max_length` if specified. Base naming convention + is case sensitive by default + """ self.max_length = max_length + self.is_case_sensitive = True @abstractmethod def normalize_identifier(self, identifier: str) -> str: @@ -58,6 +62,14 @@ def shorten_fragments(self, *normalized_idents: str) -> str: path_str = self.make_path(*normalized_idents) return self.shorten_identifier(path_str, path_str, self.max_length) + @classmethod + def name(cls) -> str: + """Naming convention name is the name of the module in which NamingConvention is defined""" + if cls.__module__.startswith("dlt.common.normalizers.naming."): + # return last component + return cls.__module__.split(".")[-1] + return cls.__module__ + @staticmethod @lru_cache(maxsize=None) def shorten_identifier( @@ -100,10 +112,3 @@ def _trim_and_tag(identifier: str, tag: str, max_length: int) -> str: ) assert len(identifier) == max_length return identifier - - -class SupportsNamingConvention(Protocol): - """Expected of modules defining naming convention""" - - NamingConvention: Type[NamingConvention] - """A class with a name NamingConvention deriving from normalizers.naming.NamingConvention""" diff --git a/dlt/common/normalizers/naming/snake_case.py b/dlt/common/normalizers/naming/snake_case.py index b3c65e9b8d..7ff9259745 100644 --- a/dlt/common/normalizers/naming/snake_case.py +++ b/dlt/common/normalizers/naming/snake_case.py @@ -1,5 +1,5 @@ import re -from typing import Any, List, Sequence +from typing import Sequence from functools import lru_cache from dlt.common.normalizers.naming.naming import NamingConvention as BaseNamingConvention @@ -18,6 +18,13 @@ class NamingConvention(BaseNamingConvention): # subsequent nested fields will be separated with the string below, applies both to field and table names PATH_SEPARATOR = "__" + def __init__(self, max_length: int = None) -> None: + """Case insensitive naming convention, converting source identifiers into snake case. Uses __ as path separator. + Multiple underscores are contracted to one. 
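+        For example, both "ColumnName" and "column__name" normalize to "column_name".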
+ """ + super().__init__(max_length) + self.is_case_sensitive = False + def normalize_identifier(self, identifier: str) -> str: identifier = super().normalize_identifier(identifier) # print(f"{identifier} -> {self.shorten_identifier(identifier, self.max_length)} ({self.max_length})") @@ -59,5 +66,5 @@ def _to_snake_case(cls, identifier: str) -> str: stripped_ident += "x" * strip_count # identifier = cls._RE_ENDING_UNDERSCORES.sub("x", identifier) - # replace consecutive underscores with single one to prevent name clashes with PATH_SEPARATOR + # replace consecutive underscores with single one to prevent name collisions with PATH_SEPARATOR return cls._RE_UNDERSCORES.sub("_", stripped_ident) diff --git a/dlt/common/normalizers/naming/sql_ci_v1.py b/dlt/common/normalizers/naming/sql_ci_v1.py new file mode 100644 index 0000000000..baabb7ecf7 --- /dev/null +++ b/dlt/common/normalizers/naming/sql_ci_v1.py @@ -0,0 +1,12 @@ +from dlt.common.normalizers.naming.sql_cs_v1 import NamingConvention as SqlCsNamingConvention + + +class NamingConvention(SqlCsNamingConvention): + def __init__(self, max_length: int = None) -> None: + """A variant of sql_cs which lower cases all identifiers.""" + + super().__init__(max_length) + self.is_case_sensitive = False + + def normalize_identifier(self, identifier: str) -> str: + return super().normalize_identifier(identifier).lower() diff --git a/dlt/common/normalizers/naming/sql_cs_v1.py b/dlt/common/normalizers/naming/sql_cs_v1.py new file mode 100644 index 0000000000..93b93bbc89 --- /dev/null +++ b/dlt/common/normalizers/naming/sql_cs_v1.py @@ -0,0 +1,22 @@ +from typing import Any, Sequence + +from dlt.common.normalizers.naming.naming import NamingConvention as BaseNamingConvention + +# TODO: not yet finished + + +class NamingConvention(BaseNamingConvention): + PATH_SEPARATOR = "__" + + _CLEANUP_TABLE = str.maketrans(".\n\r'\"▶", "______") + + def normalize_identifier(self, identifier: str) -> str: + identifier = super().normalize_identifier(identifier) + norm_identifier = identifier.translate(self._CLEANUP_TABLE) + return self.shorten_identifier(norm_identifier, identifier, self.max_length) + + def make_path(self, *identifiers: Any) -> str: + return self.PATH_SEPARATOR.join(filter(lambda x: x.strip(), identifiers)) + + def break_path(self, path: str) -> Sequence[str]: + return [ident for ident in path.split(self.PATH_SEPARATOR) if ident.strip()] diff --git a/dlt/common/normalizers/typing.py b/dlt/common/normalizers/typing.py index 599426259f..3903858091 100644 --- a/dlt/common/normalizers/typing.py +++ b/dlt/common/normalizers/typing.py @@ -1,14 +1,16 @@ -from typing import List, Optional, TypedDict +from typing import List, Optional, TypedDict, Union from dlt.common.typing import StrAny +from dlt.common.normalizers.naming import NamingConvention class TJSONNormalizer(TypedDict, total=False): module: str - config: Optional[StrAny] # config is a free form and is consumed by `module` + config: Optional[StrAny] # config is a free form and is validated by `module` class TNormalizersConfig(TypedDict, total=False): - names: str + names: Union[str, NamingConvention] + allow_identifier_change_on_table_with_data: Optional[bool] detections: Optional[List[str]] json: TJSONNormalizer diff --git a/dlt/common/normalizers/utils.py b/dlt/common/normalizers/utils.py index 645bad2bea..49751980ff 100644 --- a/dlt/common/normalizers/utils.py +++ b/dlt/common/normalizers/utils.py @@ -1,12 +1,14 @@ +import inspect from importlib import import_module -from typing import Any, Type, Tuple, 
cast, List +from typing import Any, Dict, Optional, Type, Tuple, Union, cast, List import dlt from dlt.common.configuration.inject import with_config +from dlt.common.configuration.specs import known_sections from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.normalizers.configuration import NormalizersConfiguration from dlt.common.normalizers.json import SupportsDataItemNormalizer, DataItemNormalizer -from dlt.common.normalizers.naming import NamingConvention, SupportsNamingConvention +from dlt.common.normalizers.naming import NamingConvention from dlt.common.normalizers.naming.exceptions import UnknownNamingModule, InvalidNamingModule from dlt.common.normalizers.typing import TJSONNormalizer, TNormalizersConfig from dlt.common.utils import uniq_id_base64, many_uniq_ids_base64 @@ -15,12 +17,31 @@ DLT_ID_LENGTH_BYTES = 10 -@with_config(spec=NormalizersConfiguration) +def _section_for_schema(kwargs: Dict[str, Any]) -> Tuple[str, ...]: + """Uses the schema name to generate dynamic section normalizer settings""" + if schema_name := kwargs.get("schema_name"): + return (known_sections.SOURCES, schema_name) + else: + return (known_sections.SOURCES,) + + +@with_config(spec=NormalizersConfiguration, sections=_section_for_schema) # type: ignore[call-overload] def explicit_normalizers( - naming: str = dlt.config.value, json_normalizer: TJSONNormalizer = dlt.config.value + naming: Union[str, NamingConvention] = dlt.config.value, + json_normalizer: TJSONNormalizer = dlt.config.value, + allow_identifier_change_on_table_with_data: bool = None, + schema_name: Optional[str] = None, ) -> TNormalizersConfig: - """Gets explicitly configured normalizers - via config or destination caps. May return None as naming or normalizer""" - return {"names": naming, "json": json_normalizer} + """Gets explicitly configured normalizers - via config or destination caps. May return None as naming or normalizer + + If `schema_name` is present, a section ("sources", schema_name, "schema") is used to inject the config + """ + norm_conf: TNormalizersConfig = {"names": naming, "json": json_normalizer} + if allow_identifier_change_on_table_with_data is not None: + norm_conf["allow_identifier_change_on_table_with_data"] = ( + allow_identifier_change_on_table_with_data + ) + return norm_conf @with_config @@ -34,44 +55,71 @@ def import_normalizers( """ # add defaults to normalizer_config normalizers_config["names"] = names = normalizers_config["names"] or "snake_case" - # set default json normalizer module normalizers_config["json"] = item_normalizer = normalizers_config.get("json") or {} - if "module" not in item_normalizer: - item_normalizer["module"] = "dlt.common.normalizers.json.relational" - - try: - if "." in names: - # TODO: bump schema engine version and migrate schema. 
also change the name in TNormalizersConfig from names to naming - if names == "dlt.common.normalizers.names.snake_case": - names = DEFAULT_NAMING_MODULE - # this is full module name - naming_module = cast(SupportsNamingConvention, import_module(names)) - else: - # from known location - naming_module = cast( - SupportsNamingConvention, import_module(f"dlt.common.normalizers.naming.{names}") - ) - except ImportError: - raise UnknownNamingModule(names) - if not hasattr(naming_module, "NamingConvention"): - raise InvalidNamingModule(names) - # get max identifier length - if destination_capabilities: - max_length = min( - destination_capabilities.max_identifier_length, - destination_capabilities.max_column_identifier_length, - ) - else: - max_length = None + item_normalizer.setdefault("module", "dlt.common.normalizers.json.relational") json_module = cast(SupportsDataItemNormalizer, import_module(item_normalizer["module"])) return ( normalizers_config, - naming_module.NamingConvention(max_length), + naming_from_reference(names, destination_capabilities), json_module.DataItemNormalizer, ) +def naming_from_reference( + names: Union[str, NamingConvention], + destination_capabilities: DestinationCapabilitiesContext = None, +) -> NamingConvention: + """Resolves naming convention from reference in `names` and applies max length from `destination_capabilities` + + Reference may be: (1) actual instance of NamingConvention (2) shorthand name pointing to `dlt.common.normalizers.naming` namespace + (3) a type name which is a module containing `NamingConvention` attribute (4) a type of class deriving from NamingConvention + """ + + def _import_naming(module: str, cls: str) -> Type[NamingConvention]: + if "." in module or cls != "NamingConvention": + # TODO: bump schema engine version and migrate schema. also change the name in TNormalizersConfig from names to naming + if module == "dlt.common.normalizers.names.snake_case": + module = DEFAULT_NAMING_MODULE + # this is full module name + naming_module = import_module(module) + else: + # from known location + naming_module = import_module(f"dlt.common.normalizers.naming.{module}") + class_ = getattr(naming_module, cls, None) + if class_ is None: + raise UnknownNamingModule(module + "." 
+ cls) + if inspect.isclass(class_) and issubclass(class_, NamingConvention): + return class_ + raise InvalidNamingModule(module, cls) + + if not isinstance(names, NamingConvention): + try: + class_ = _import_naming(names, "NamingConvention") + except ImportError: + parts = names.rsplit(".", 1) + # we have no more options to try + if len(parts) <= 1: + raise UnknownNamingModule(names) + try: + class_ = _import_naming(*parts) + except UnknownNamingModule: + raise + except ImportError: + raise UnknownNamingModule(names) + + # get max identifier length + if destination_capabilities: + max_length = min( + destination_capabilities.max_identifier_length, + destination_capabilities.max_column_identifier_length, + ) + else: + max_length = None + names = class_(max_length) + return names + + def generate_dlt_ids(n_ids: int) -> List[str]: return many_uniq_ids_base64(n_ids, DLT_ID_LENGTH_BYTES) diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index 6cefdd9e6c..c6ee27e58b 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -260,9 +260,6 @@ def asstr(self, verbosity: int = 0) -> str: return self._load_packages_asstr(self.load_packages, verbosity) -# reveal_type(ExtractInfo) - - class NormalizeMetrics(StepMetrics): job_metrics: Dict[str, DataWriterMetrics] """Metrics collected per job id during writing of job file""" diff --git a/dlt/common/schema/exceptions.py b/dlt/common/schema/exceptions.py index 678f4de15e..2f016577ce 100644 --- a/dlt/common/schema/exceptions.py +++ b/dlt/common/schema/exceptions.py @@ -7,37 +7,45 @@ TSchemaContractEntities, TSchemaEvolutionMode, ) +from dlt.common.normalizers.naming import NamingConvention class SchemaException(DltException): - pass + def __init__(self, schema_name: str, msg: str) -> None: + self.schema_name = schema_name + if schema_name: + msg = f"In schema: {schema_name}: " + msg + super().__init__(msg) class InvalidSchemaName(ValueError, SchemaException): MAXIMUM_SCHEMA_NAME_LENGTH = 64 - def __init__(self, name: str) -> None: - self.name = name + def __init__(self, schema_name: str) -> None: + self.name = schema_name super().__init__( - f"{name} is an invalid schema/source name. The source or schema name must be a valid" - " Python identifier ie. a snake case function name and have maximum" + schema_name, + f"{schema_name} is an invalid schema/source name. The source or schema name must be a" + " valid Python identifier ie. a snake case function name and have maximum" f" {self.MAXIMUM_SCHEMA_NAME_LENGTH} characters. Ideally should contain only small" - " letters, numbers and underscores." + " letters, numbers and underscores.", ) -class InvalidDatasetName(ValueError, SchemaException): - def __init__(self, destination_name: str) -> None: - self.destination_name = destination_name - super().__init__( - f"Destination {destination_name} does not accept empty datasets. Please pass the" - " dataset name to the destination configuration ie. via dlt pipeline." - ) +# TODO: does not look like a SchemaException +# class InvalidDatasetName(ValueError, SchemaException): +# def __init__(self, destination_name: str) -> None: +# self.destination_name = destination_name +# super().__init__( +# f"Destination {destination_name} does not accept empty datasets. Please pass the" +# " dataset name to the destination configuration ie. via dlt pipeline." 
+# ) class CannotCoerceColumnException(SchemaException): def __init__( self, + schema_name: str, table_name: str, column_name: str, from_type: TDataType, @@ -50,37 +58,43 @@ def __init__( self.to_type = to_type self.coerced_value = coerced_value super().__init__( + schema_name, f"Cannot coerce type in table {table_name} column {column_name} existing type" - f" {from_type} coerced type {to_type} value: {coerced_value}" + f" {from_type} coerced type {to_type} value: {coerced_value}", ) class TablePropertiesConflictException(SchemaException): - def __init__(self, table_name: str, prop_name: str, val1: str, val2: str): + def __init__(self, schema_name: str, table_name: str, prop_name: str, val1: str, val2: str): self.table_name = table_name self.prop_name = prop_name self.val1 = val1 self.val2 = val2 super().__init__( + schema_name, f"Cannot merge partial tables for {table_name} due to property {prop_name}: {val1} !=" - f" {val2}" + f" {val2}", ) class ParentTableNotFoundException(SchemaException): - def __init__(self, table_name: str, parent_table_name: str, explanation: str = "") -> None: + def __init__( + self, schema_name: str, table_name: str, parent_table_name: str, explanation: str = "" + ) -> None: self.table_name = table_name self.parent_table_name = parent_table_name super().__init__( + schema_name, f"Parent table {parent_table_name} for {table_name} was not found in the" - f" schema.{explanation}" + f" schema.{explanation}", ) class CannotCoerceNullException(SchemaException): - def __init__(self, table_name: str, column_name: str) -> None: + def __init__(self, schema_name: str, table_name: str, column_name: str) -> None: super().__init__( - f"Cannot coerce NULL in table {table_name} column {column_name} which is not nullable" + schema_name, + f"Cannot coerce NULL in table {table_name} column {column_name} which is not nullable", ) @@ -88,19 +102,48 @@ class SchemaCorruptedException(SchemaException): pass +class SchemaIdentifierNormalizationCollision(SchemaCorruptedException): + def __init__( + self, + schema_name: str, + table_name: str, + identifier_type: str, + identifier_name: str, + conflict_identifier_name: str, + naming_name: str, + collision_msg: str, + ) -> None: + if identifier_type == "column": + table_info = f"in table {table_name} " + else: + table_info = "" + msg = ( + f"A {identifier_type} name {identifier_name} {table_info}collides with" + f" {conflict_identifier_name} after normalization with {naming_name} naming" + " convention. " + + collision_msg + ) + self.table_name = table_name + self.identifier_type = identifier_type + self.identifier_name = identifier_name + self.conflict_identifier_name = conflict_identifier_name + self.naming_name = naming_name + super().__init__(schema_name, msg) + + class SchemaEngineNoUpgradePathException(SchemaException): def __init__( self, schema_name: str, init_engine: int, from_engine: int, to_engine: int ) -> None: - self.schema_name = schema_name self.init_engine = init_engine self.from_engine = from_engine self.to_engine = to_engine super().__init__( + schema_name, f"No engine upgrade path in schema {schema_name} from {init_engine} to {to_engine}," f" stopped at {from_engine}. You possibly tried to run an older dlt" " version against a destination you have previously loaded data to with a newer dlt" - " version." + " version.", ) @@ -133,8 +176,7 @@ def __init__( + f" . Contract on {schema_entity} with mode {contract_mode} is violated. 
" + (extended_info or "") ) - super().__init__(msg) - self.schema_name = schema_name + super().__init__(schema_name, msg) self.table_name = table_name self.column_name = column_name @@ -148,10 +190,43 @@ def __init__( self.data_item = data_item -class UnknownTableException(SchemaException): - def __init__(self, table_name: str) -> None: +class UnknownTableException(KeyError, SchemaException): + def __init__(self, schema_name: str, table_name: str) -> None: self.table_name = table_name - super().__init__(f"Trying to access unknown table {table_name}.") + super().__init__(schema_name, f"Trying to access unknown table {table_name}.") + + +class TableIdentifiersFrozen(SchemaException): + def __init__( + self, + schema_name: str, + table_name: str, + to_naming: NamingConvention, + from_naming: NamingConvention, + details: str, + ) -> None: + self.table_name = table_name + self.to_naming = to_naming + self.from_naming = from_naming + msg = ( + f"Attempt to normalize identifiers for a table {table_name} from naming" + f" {from_naming.name()} to {to_naming.name()} changed one or more identifiers. " + ) + msg += ( + " This table already received data and tables were created at the destination. By" + " default changing the identifiers is not allowed. " + ) + msg += ( + " Such changes may result in creation of a new table or a new columns while the old" + " columns with data will still be kept. " + ) + msg += ( + " You may disable this behavior by setting" + " schema.allow_identifier_change_on_table_with_data to True or removing `x-normalizer`" + " hints from particular tables. " + ) + msg += f" Details: {details}" + super().__init__(schema_name, msg) class ColumnNameConflictException(SchemaException): diff --git a/dlt/common/schema/migrations.py b/dlt/common/schema/migrations.py index 9b206d61a6..1ef602a3f8 100644 --- a/dlt/common/schema/migrations.py +++ b/dlt/common/schema/migrations.py @@ -1,7 +1,7 @@ from typing import Dict, List, cast from dlt.common.data_types import TDataType -from dlt.common.normalizers import explicit_normalizers +from dlt.common.normalizers.utils import explicit_normalizers from dlt.common.typing import DictStrAny from dlt.common.schema.typing import ( LOADS_TABLE_NAME, @@ -14,7 +14,7 @@ from dlt.common.schema.exceptions import SchemaEngineNoUpgradePathException from dlt.common.normalizers.utils import import_normalizers -from dlt.common.schema.utils import new_table, version_table, load_table +from dlt.common.schema.utils import new_table, version_table, loads_table def migrate_schema(schema_dict: DictStrAny, from_engine: int, to_engine: int) -> TStoredSchema: @@ -92,11 +92,11 @@ def migrate_filters(group: str, filters: List[str]) -> None: if from_engine == 4 and to_engine > 4: # replace schema versions table schema_dict["tables"][VERSION_TABLE_NAME] = version_table() - schema_dict["tables"][LOADS_TABLE_NAME] = load_table() + schema_dict["tables"][LOADS_TABLE_NAME] = loads_table() from_engine = 5 if from_engine == 5 and to_engine > 5: # replace loads table - schema_dict["tables"][LOADS_TABLE_NAME] = load_table() + schema_dict["tables"][LOADS_TABLE_NAME] = loads_table() from_engine = 6 if from_engine == 6 and to_engine > 6: # migrate from sealed properties to schema evolution settings diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 6d5dc48907..fd0521cc14 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -1,8 +1,19 @@ from copy import copy, deepcopy -from typing import ClassVar, Dict, List, Mapping, Optional, 
Sequence, Tuple, Any, cast, Literal +from typing import ( + Callable, + ClassVar, + Dict, + List, + Mapping, + Optional, + Sequence, + Tuple, + Any, + cast, +) from dlt.common.schema.migrations import migrate_schema -from dlt.common.utils import extend_list_deduplicated +from dlt.common.utils import extend_list_deduplicated, get_full_class_name from dlt.common.typing import ( DictStrAny, StrAny, @@ -11,8 +22,8 @@ VARIANT_FIELD_FORMAT, TDataItem, ) -from dlt.common.normalizers import TNormalizersConfig, explicit_normalizers, import_normalizers -from dlt.common.normalizers.naming import NamingConvention +from dlt.common.normalizers import TNormalizersConfig, NamingConvention +from dlt.common.normalizers.utils import explicit_normalizers, import_normalizers from dlt.common.normalizers.json import DataItemNormalizer, TNormalizedRowIterator from dlt.common.schema import utils from dlt.common.data_types import py_type_to_sc_type, coerce_value, TDataType @@ -22,7 +33,7 @@ SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, - STATE_TABLE_NAME, + PIPELINE_STATE_TABLE_NAME, TPartialTableSchema, TSchemaContractEntities, TSchemaEvolutionMode, @@ -45,6 +56,7 @@ InvalidSchemaName, ParentTableNotFoundException, SchemaCorruptedException, + TableIdentifiersFrozen, ) from dlt.common.validation import validate_dict from dlt.common.schema.exceptions import DataValidationError @@ -102,13 +114,18 @@ def __init__(self, name: str, normalizers: TNormalizersConfig = None) -> None: self._reset_schema(name, normalizers) @classmethod - def from_dict(cls, d: DictStrAny, bump_version: bool = True) -> "Schema": + def from_dict( + cls, d: DictStrAny, remove_processing_hints: bool = False, bump_version: bool = True + ) -> "Schema": # upgrade engine if needed stored_schema = migrate_schema(d, d["engine_version"], cls.ENGINE_VERSION) # verify schema utils.validate_stored_schema(stored_schema) # add defaults stored_schema = utils.apply_defaults(stored_schema) + # remove processing hints that could be created by normalize and load steps + if remove_processing_hints: + utils.remove_processing_hints(stored_schema["tables"]) # bump version if modified if bump_version: @@ -141,30 +158,6 @@ def replace_schema_content( self._reset_schema(schema.name, schema._normalizers_config) self._from_stored_schema(stored_schema) - def to_dict(self, remove_defaults: bool = False, bump_version: bool = True) -> TStoredSchema: - stored_schema: TStoredSchema = { - "version": self._stored_version, - "version_hash": self._stored_version_hash, - "engine_version": Schema.ENGINE_VERSION, - "name": self._schema_name, - "tables": self._schema_tables, - "settings": self._settings, - "normalizers": self._normalizers_config, - "previous_hashes": self._stored_previous_hashes, - } - if self._imported_version_hash and not remove_defaults: - stored_schema["imported_version_hash"] = self._imported_version_hash - if self._schema_description: - stored_schema["description"] = self._schema_description - - # bump version if modified - if bump_version: - utils.bump_version_if_modified(stored_schema) - # remove defaults after bumping version - if remove_defaults: - utils.remove_defaults(stored_schema) - return stored_schema - def normalize_data_item( self, item: TDataItem, load_id: str, table_name: str ) -> TNormalizedRowIterator: @@ -317,7 +310,7 @@ def apply_schema_contract( column_mode, data_mode = schema_contract["columns"], schema_contract["data_type"] # allow to add new columns when table is new or if columns are allowed to evolve once - if is_new_table or 
existing_table.get("x-normalizer", {}).get("evolve-columns-once", False): # type: ignore[attr-defined] + if is_new_table or existing_table.get("x-normalizer", {}).get("evolve-columns-once", False): column_mode = "evolve" # check if we should filter any columns, partial table below contains only new columns @@ -402,14 +395,20 @@ def resolve_contract_settings_for_table( # expand settings, empty settings will expand into default settings return Schema.expand_schema_contract_settings(settings) - def update_table(self, partial_table: TPartialTableSchema) -> TPartialTableSchema: - """Adds or merges `partial_table` into the schema. Identifiers are not normalized""" + def update_table( + self, partial_table: TPartialTableSchema, normalize_identifiers: bool = True + ) -> TPartialTableSchema: + """Adds or merges `partial_table` into the schema. Identifiers are normalized by default""" + if normalize_identifiers: + partial_table = utils.normalize_table_identifiers(partial_table, self.naming) + table_name = partial_table["name"] parent_table_name = partial_table.get("parent") # check if parent table present if parent_table_name is not None: if self._schema_tables.get(parent_table_name) is None: raise ParentTableNotFoundException( + self.name, table_name, parent_table_name, " This may be due to misconfigured excludes filter that fully deletes content" @@ -422,21 +421,20 @@ def update_table(self, partial_table: TPartialTableSchema) -> TPartialTableSchem self._schema_tables[table_name] = partial_table else: # merge tables performing additional checks - partial_table = utils.merge_table(table, partial_table) + partial_table = utils.merge_table(self.name, table, partial_table) self.data_item_normalizer.extend_table(table_name) return partial_table def update_schema(self, schema: "Schema") -> None: """Updates this schema from an incoming schema. Normalizes identifiers after updating normalizers.""" - # update all tables - for table in schema.tables.values(): - self.update_table(table) # pass normalizer config - self._configure_normalizers(schema._normalizers_config) - # update and compile settings self._settings = deepcopy(schema.settings) + self._configure_normalizers(schema._normalizers_config) self._compile_settings() + # update all tables + for table in schema.tables.values(): + self.update_table(table) def drop_tables( self, table_names: Sequence[str], seen_data_only: bool = False @@ -467,67 +465,60 @@ def filter_row_with_hint(self, table_name: str, hint_type: TColumnHint, row: Str # dicts are ordered and we will return the rows with hints in the same order as they appear in the columns return rv_row - def merge_hints(self, new_hints: Mapping[TColumnHint, Sequence[TSimpleRegex]]) -> None: - # validate regexes - validate_dict( - TSchemaSettings, - {"default_hints": new_hints}, - ".", - validator_f=utils.simple_regex_validator, - ) - # prepare hints to be added - default_hints = self._settings.setdefault("default_hints", {}) - # add `new_hints` to existing hints - for h, l in new_hints.items(): - if h in default_hints: - extend_list_deduplicated(default_hints[h], l) - else: - # set new hint type - default_hints[h] = l # type: ignore + def merge_hints( + self, + new_hints: Mapping[TColumnHint, Sequence[TSimpleRegex]], + normalize_identifiers: bool = True, + ) -> None: + """Merges existing default hints with `new_hints`. Normalizes names in column regexes if possible. 
Compiles setting at the end + + NOTE: you can manipulate default hints collection directly via `Schema.settings` as long as you call Schema._compile_settings() at the end. + """ + self._merge_hints(new_hints, normalize_identifiers) self._compile_settings() - def normalize_table_identifiers(self, table: TTableSchema) -> TTableSchema: - """Normalizes all table and column names in `table` schema according to current schema naming convention and returns - new normalized TTableSchema instance. + def update_preferred_types( + self, + new_preferred_types: Mapping[TSimpleRegex, TDataType], + normalize_identifiers: bool = True, + ) -> None: + """Updates preferred types dictionary with `new_preferred_types`. Normalizes names in column regexes if possible. Compiles setting at the end - Naming convention like snake_case may produce name clashes with the column names. Clashing column schemas are merged - where the column that is defined later in the dictionary overrides earlier column. + NOTE: you can manipulate preferred hints collection directly via `Schema.settings` as long as you call Schema._compile_settings() at the end. + """ + self._update_preferred_types(new_preferred_types, normalize_identifiers) + self._compile_settings() - Note that resource name is not normalized. + def add_type_detection(self, detection: TTypeDetections) -> None: + """Add type auto detection to the schema.""" + if detection not in self.settings["detections"]: + self.settings["detections"].append(detection) + self._compile_settings() - """ - # normalize all identifiers in table according to name normalizer of the schema - table["name"] = self.naming.normalize_tables_path(table["name"]) - parent = table.get("parent") - if parent: - table["parent"] = self.naming.normalize_tables_path(parent) - columns = table.get("columns") - if columns: - new_columns: TTableSchemaColumns = {} - for c in columns.values(): - new_col_name = c["name"] = self.naming.normalize_path(c["name"]) - # re-index columns as the name changed, if name space was reduced then - # some columns now clash with each other. so make sure that we merge columns that are already there - if new_col_name in new_columns: - new_columns[new_col_name] = utils.merge_column( - new_columns[new_col_name], c, merge_defaults=False - ) - else: - new_columns[new_col_name] = c - table["columns"] = new_columns - return table + def remove_type_detection(self, detection: TTypeDetections) -> None: + """Adds type auto detection to the schema.""" + if detection in self.settings["detections"]: + self.settings["detections"].remove(detection) + self._compile_settings() def get_new_table_columns( self, table_name: str, - exiting_columns: TTableSchemaColumns, + existing_columns: TTableSchemaColumns, + case_sensitive: bool = True, include_incomplete: bool = False, ) -> List[TColumnSchema]: - """Gets new columns to be added to `exiting_columns` to bring them up to date with `table_name` schema. Optionally includes incomplete columns (without data type)""" + """Gets new columns to be added to `existing_columns` to bring them up to date with `table_name` schema. + Columns names are compared case sensitive by default. 
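+        When `case_sensitive` is False, names are compared after `str.casefold`, so a column whose
+        name differs only in casing from an existing one is not reported as new.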
+ Optionally includes incomplete columns (without data type)""" + casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str # type: ignore[assignment] + casefold_existing = { + casefold_f(col_name): col for col_name, col in existing_columns.items() + } diff_c: List[TColumnSchema] = [] s_t = self.get_table_columns(table_name, include_incomplete=include_incomplete) for c in s_t.values(): - if c["name"] not in exiting_columns: + if casefold_f(c["name"]) not in casefold_existing: diff_c.append(c) return diff_c @@ -651,20 +642,77 @@ def tables(self) -> TSchemaTables: def settings(self) -> TSchemaSettings: return self._settings - def to_pretty_json(self, remove_defaults: bool = True) -> str: - d = self.to_dict(remove_defaults=remove_defaults) + def to_dict( + self, + remove_defaults: bool = False, + remove_processing_hints: bool = False, + bump_version: bool = True, + ) -> TStoredSchema: + # prepare normalizers + if isinstance(self._normalizers_config["names"], NamingConvention): + normalizers_config = deepcopy(self._normalizers_config) + normalizers_config["names"] = get_full_class_name(normalizers_config["names"]) + else: + normalizers_config = self._normalizers_config + + stored_schema: TStoredSchema = { + "version": self._stored_version, + "version_hash": self._stored_version_hash, + "engine_version": Schema.ENGINE_VERSION, + "name": self._schema_name, + "tables": self._schema_tables, + "settings": self._settings, + "normalizers": normalizers_config, + "previous_hashes": self._stored_previous_hashes, + } + if self._imported_version_hash and not remove_defaults: + stored_schema["imported_version_hash"] = self._imported_version_hash + if self._schema_description: + stored_schema["description"] = self._schema_description + + # remove processing hints that could be created by normalize and load steps + if remove_processing_hints: + stored_schema["tables"] = utils.remove_processing_hints( + deepcopy(stored_schema["tables"]) + ) + + # bump version if modified + if bump_version: + utils.bump_version_if_modified(stored_schema) + # remove defaults after bumping version + if remove_defaults: + utils.remove_defaults(stored_schema) + return stored_schema + + def to_pretty_json( + self, remove_defaults: bool = True, remove_processing_hints: bool = False + ) -> str: + d = self.to_dict( + remove_defaults=remove_defaults, remove_processing_hints=remove_processing_hints + ) return utils.to_pretty_json(d) - def to_pretty_yaml(self, remove_defaults: bool = True) -> str: - d = self.to_dict(remove_defaults=remove_defaults) + def to_pretty_yaml( + self, remove_defaults: bool = True, remove_processing_hints: bool = False + ) -> str: + d = self.to_dict( + remove_defaults=remove_defaults, remove_processing_hints=remove_processing_hints + ) return utils.to_pretty_yaml(d) - def clone(self, with_name: str = None, update_normalizers: bool = False) -> "Schema": - """Make a deep copy of the schema, optionally changing the name, and updating normalizers and identifiers in the schema if `update_normalizers` is True - - Note that changing of name will set the schema as new + def clone( + self, + with_name: str = None, + remove_processing_hints: bool = False, + update_normalizers: bool = False, + ) -> "Schema": + """Make a deep copy of the schema, optionally changing the name, removing processing markers and updating normalizers and identifiers in the schema if `update_normalizers` is True + Processing markers are `x-` hints created by normalizer (`x-normalizer`) and loader (`x-loader`) to ie. 
mark newly inferred tables and tables that seen data. + Note that changing of name will break the previous version chain """ - d = deepcopy(self.to_dict(bump_version=False)) + d = deepcopy( + self.to_dict(bump_version=False, remove_processing_hints=remove_processing_hints) + ) if with_name is not None: d["version"] = d["version_hash"] = None d.pop("imported_version_hash", None) @@ -677,12 +725,19 @@ def clone(self, with_name: str = None, update_normalizers: bool = False) -> "Sch return schema def update_normalizers(self) -> None: - """Looks for new normalizer configuration or for destination capabilities context and updates all identifiers in the schema""" - normalizers = explicit_normalizers() + """Looks for new normalizer configuration or for destination capabilities context and updates all identifiers in the schema + + Table and column names will be normalized with new naming convention, except tables that have seen data ('x-normalizer`) which will + raise if any identifier is to be changed. + Default hints, preferred data types and normalize configs (ie. column propagation) are normalized as well. Regexes are included as long + as textual parts can be extracted from an expression. + """ + normalizers = explicit_normalizers(schema_name=self._schema_name) # set the current values as defaults normalizers["names"] = normalizers["names"] or self._normalizers_config["names"] normalizers["json"] = normalizers["json"] or self._normalizers_config["json"] self._configure_normalizers(normalizers) + self._compile_settings() def set_schema_contract(self, settings: TSchemaContract) -> None: if not settings: @@ -690,18 +745,6 @@ def set_schema_contract(self, settings: TSchemaContract) -> None: else: self._settings["schema_contract"] = settings - def add_type_detection(self, detection: TTypeDetections) -> None: - """Add type auto detection to the schema.""" - if detection not in self.settings["detections"]: - self.settings["detections"].append(detection) - self._compile_settings() - - def remove_type_detection(self, detection: TTypeDetections) -> None: - """Adds type auto detection to the schema.""" - if detection in self.settings["detections"]: - self.settings["detections"].remove(detection) - self._compile_settings() - def _infer_column( self, k: str, v: Any, data_type: TDataType = None, is_variant: bool = False ) -> TColumnSchema: @@ -727,7 +770,7 @@ def _coerce_null_value( if col_name in table_columns: existing_column = table_columns[col_name] if not existing_column.get("nullable", True): - raise CannotCoerceNullException(table_name, col_name) + raise CannotCoerceNullException(self.name, table_name, col_name) def _coerce_non_null_value( self, @@ -759,7 +802,12 @@ def _coerce_non_null_value( if is_variant: # this is final call: we cannot generate any more auto-variants raise CannotCoerceColumnException( - table_name, col_name, py_type, table_columns[col_name]["data_type"], v + self.name, + table_name, + col_name, + py_type, + table_columns[col_name]["data_type"], + v, ) # otherwise we must create variant extension to the table # pass final=True so no more auto-variants can be created recursively @@ -816,6 +864,57 @@ def _infer_hint(self, hint_type: TColumnHint, _: Any, col_name: str) -> bool: else: return False + def _merge_hints( + self, + new_hints: Mapping[TColumnHint, Sequence[TSimpleRegex]], + normalize_identifiers: bool = True, + ) -> None: + """Used by `merge_hints method, does not compile settings at the end""" + # validate regexes + validate_dict( + TSchemaSettings, + {"default_hints": 
new_hints}, + ".", + validator_f=utils.simple_regex_validator, + ) + if normalize_identifiers: + new_hints = self._normalize_default_hints(new_hints) + # prepare hints to be added + default_hints = self._settings.setdefault("default_hints", {}) + # add `new_hints` to existing hints + for h, l in new_hints.items(): + if h in default_hints: + extend_list_deduplicated(default_hints[h], l, utils.canonical_simple_regex) + else: + # set new hint type + default_hints[h] = l # type: ignore + + def _update_preferred_types( + self, + new_preferred_types: Mapping[TSimpleRegex, TDataType], + normalize_identifiers: bool = True, + ) -> None: + # validate regexes + validate_dict( + TSchemaSettings, + {"preferred_types": new_preferred_types}, + ".", + validator_f=utils.simple_regex_validator, + ) + if normalize_identifiers: + new_preferred_types = self._normalize_preferred_types(new_preferred_types) + preferred_types = self._settings.setdefault("preferred_types", {}) + # we must update using canonical simple regex + canonical_preferred = { + utils.canonical_simple_regex(rx): rx for rx in preferred_types.keys() + } + for new_rx, new_dt in new_preferred_types.items(): + canonical_new_rx = utils.canonical_simple_regex(new_rx) + if canonical_new_rx not in canonical_preferred: + preferred_types[new_rx] = new_dt + else: + preferred_types[canonical_preferred[canonical_new_rx]] = new_dt + def _bump_version(self) -> Tuple[int, str]: """Computes schema hash in order to check if schema content was modified. In such case the schema ``stored_version`` and ``stored_version_hash`` are updated. @@ -839,40 +938,124 @@ def _drop_version(self) -> None: self._stored_version_hash = self._stored_previous_hashes.pop(0) def _add_standard_tables(self) -> None: - self._schema_tables[self.version_table_name] = self.normalize_table_identifiers( - utils.version_table() + self._schema_tables[self.version_table_name] = utils.normalize_table_identifiers( + utils.version_table(), self.naming ) - self._schema_tables[self.loads_table_name] = self.normalize_table_identifiers( - utils.load_table() + self._schema_tables[self.loads_table_name] = utils.normalize_table_identifiers( + utils.loads_table(), self.naming ) def _add_standard_hints(self) -> None: - default_hints = utils.standard_hints() + default_hints = utils.default_hints() if default_hints: - self._settings["default_hints"] = default_hints + self._merge_hints(default_hints, normalize_identifiers=False) type_detections = utils.standard_type_detections() if type_detections: self._settings["detections"] = type_detections - def _configure_normalizers(self, normalizers: TNormalizersConfig) -> None: - # import desired modules - self._normalizers_config, naming_module, item_normalizer_class = import_normalizers( - normalizers - ) - # print(f"{self.name}: {type(self.naming)} {type(naming_module)}") - if self.naming and type(self.naming) is not type(naming_module): - self.naming = naming_module + def _normalize_default_hints( + self, default_hints: Mapping[TColumnHint, Sequence[TSimpleRegex]] + ) -> Dict[TColumnHint, List[TSimpleRegex]]: + """Normalizes the column names in default hints. In case of column names that are regexes, normalization is skipped""" + return { + hint: [utils.normalize_simple_regex_column(self.naming, regex) for regex in regexes] + for hint, regexes in default_hints.items() + } + + def _normalize_preferred_types( + self, preferred_types: Mapping[TSimpleRegex, TDataType] + ) -> Dict[TSimpleRegex, TDataType]: + """Normalizes the column names in preferred types mapping. 
In case of column names that are regexes, normalization is skipped""" + return { + utils.normalize_simple_regex_column(self.naming, regex): data_type + for regex, data_type in preferred_types.items() + } + + def _verify_update_normalizers( + self, + normalizers_config: TNormalizersConfig, + to_naming: NamingConvention, + from_naming: NamingConvention, + ) -> TSchemaTables: + """Verifies if normalizers can be updated before schema is changed""" + # print(f"{self.name}: {type(to_naming)} {type(naming_module)}") + if from_naming and type(from_naming) is not type(to_naming): + schema_tables = {} for table in self._schema_tables.values(): - self.normalize_table_identifiers(table) + norm_table = utils.normalize_table_identifiers(table, to_naming) + if utils.has_table_seen_data(norm_table) and not normalizers_config.get( + "allow_identifier_change_on_table_with_data", False + ): + # make sure no identifier got changed in table + if norm_table["name"] != table["name"]: + raise TableIdentifiersFrozen( + self.name, + table["name"], + to_naming, + from_naming, + f"Attempt to rename table name to {norm_table['name']}.", + ) + if len(norm_table["columns"]) != len(table["columns"]): + raise TableIdentifiersFrozen( + self.name, + table["name"], + to_naming, + from_naming, + "Number of columns changed after normalization. Some columns must have" + " merged.", + ) + col_diff = set(norm_table["columns"].keys()).difference(table["columns"].keys()) + if len(col_diff) > 0: + raise TableIdentifiersFrozen( + self.name, + table["name"], + to_naming, + from_naming, + f"Some columns got renamed to {col_diff}.", + ) + schema_tables[norm_table["name"]] = norm_table # re-index the table names - self._schema_tables = {t["name"]: t for t in self._schema_tables.values()} + return schema_tables + else: + return self._schema_tables + def _renormalize_schema_identifiers( + self, + normalizers_config: TNormalizersConfig, + to_naming: NamingConvention, + from_naming: NamingConvention, + ) -> None: + """Normalizes all identifiers in the schema in place""" + self._schema_tables = self._verify_update_normalizers( + normalizers_config, to_naming, from_naming + ) + self._normalizers_config = normalizers_config + self.naming = to_naming # name normalization functions - self.naming = naming_module - self._dlt_tables_prefix = self.naming.normalize_table_identifier(DLT_NAME_PREFIX) - self.version_table_name = self.naming.normalize_table_identifier(VERSION_TABLE_NAME) - self.loads_table_name = self.naming.normalize_table_identifier(LOADS_TABLE_NAME) - self.state_table_name = self.naming.normalize_table_identifier(STATE_TABLE_NAME) + self._dlt_tables_prefix = to_naming.normalize_table_identifier(DLT_NAME_PREFIX) + self.version_table_name = to_naming.normalize_table_identifier(VERSION_TABLE_NAME) + self.loads_table_name = to_naming.normalize_table_identifier(LOADS_TABLE_NAME) + self.state_table_name = to_naming.normalize_table_identifier(PIPELINE_STATE_TABLE_NAME) + # do a sanity check - dlt tables must start with dlt prefix + for table_name in [self.version_table_name, self.loads_table_name, self.state_table_name]: + if not table_name.startswith(self._dlt_tables_prefix): + raise SchemaCorruptedException( + self.name, + f"A naming convention {self.naming.name()} mangles _dlt table prefix to" + f" '{self._dlt_tables_prefix}'. 
A table '{table_name}' does not start with it.", + ) + # normalize default hints + if default_hints := self._settings.get("default_hints"): + self._settings["default_hints"] = self._normalize_default_hints(default_hints) + # normalized preferred types + if preferred_types := self.settings.get("preferred_types"): + self._settings["preferred_types"] = self._normalize_preferred_types(preferred_types) + + def _configure_normalizers(self, normalizers: TNormalizersConfig) -> None: + """Gets naming and item normalizer from schema yaml, config providers and destination capabilities and applies them to schema.""" + # import desired modules + normalizers_config, to_naming, item_normalizer_class = import_normalizers(normalizers) + self._renormalize_schema_identifiers(normalizers_config, to_naming, self.naming) # data item normalization function self.data_item_normalizer = item_normalizer_class(self) self.data_item_normalizer.extend_schema() @@ -903,7 +1086,7 @@ def _reset_schema(self, name: str, normalizers: TNormalizersConfig = None) -> No self._add_standard_hints() # configure normalizers, including custom config if present if not normalizers: - normalizers = explicit_normalizers() + normalizers = explicit_normalizers(schema_name=self._schema_name) self._configure_normalizers(normalizers) # add version tables self._add_standard_tables() @@ -913,9 +1096,13 @@ def _reset_schema(self, name: str, normalizers: TNormalizersConfig = None) -> No def _from_stored_schema(self, stored_schema: TStoredSchema) -> None: self._schema_tables = stored_schema.get("tables") or {} if self.version_table_name not in self._schema_tables: - raise SchemaCorruptedException(f"Schema must contain table {self.version_table_name}") + raise SchemaCorruptedException( + stored_schema["name"], f"Schema must contain table {self.version_table_name}" + ) if self.loads_table_name not in self._schema_tables: - raise SchemaCorruptedException(f"Schema must contain table {self.loads_table_name}") + raise SchemaCorruptedException( + stored_schema["name"], f"Schema must contain table {self.loads_table_name}" + ) self._stored_version = stored_schema["version"] self._stored_version_hash = stored_schema["version_hash"] self._imported_version_hash = stored_schema.get("imported_version_hash") diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index fb360b38d3..b5081c5ff4 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -17,8 +17,7 @@ from dlt.common.data_types import TDataType from dlt.common.normalizers.typing import TNormalizersConfig -from dlt.common.typing import TSortOrder, TAnyDateTime -from dlt.common.pendulum import pendulum +from dlt.common.typing import TSortOrder, TAnyDateTime, TLoaderFileFormat try: from pydantic import BaseModel as _PydanticBaseModel @@ -32,7 +31,7 @@ # dlt tables VERSION_TABLE_NAME = "_dlt_version" LOADS_TABLE_NAME = "_dlt_loads" -STATE_TABLE_NAME = "_dlt_pipeline_state" +PIPELINE_STATE_TABLE_NAME = "_dlt_pipeline_state" DLT_NAME_PREFIX = "_dlt" TColumnProp = Literal[ @@ -47,6 +46,7 @@ "unique", "merge_key", "root_key", + "hard_delete", "dedup_sort", ] """Known properties and hints of the column""" @@ -59,12 +59,16 @@ "foreign_key", "sort", "unique", - "root_key", "merge_key", + "root_key", + "hard_delete", "dedup_sort", ] """Known hints of a column used to declare hint regexes.""" + +TWriteDisposition = Literal["skip", "append", "replace", "merge"] TTableFormat = Literal["iceberg", "delta"] +TFileFormat = Literal[Literal["preferred"], TLoaderFileFormat] 
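Editorial aside: the `TFileFormat` alias added just above nests one `Literal` inside another (`Literal["preferred"]` plus the `TLoaderFileFormat` alias). Later hunks in this patch add `TLoaderFileFormat` and a recursive `get_literal_args` helper to `dlt/common/typing.py` and switch `validation.py` over to it, so such nested literal aliases validate against their full set of options. A minimal usage sketch, not part of the patch; the import paths and the flattened option list are taken from those later hunks and should be treated as assumptions.

```py
# Sketch only, not part of the patch. Mirrors the TFileFormat alias added above and
# uses the get_literal_args helper that a later hunk adds to dlt/common/typing.py;
# import paths and the exact option list are assumptions.
from typing import Literal

from dlt.common.typing import TLoaderFileFormat, get_literal_args

# same shape as the alias added in dlt/common/schema/typing.py above
TFileFormat = Literal[Literal["preferred"], TLoaderFileFormat]

# expected to flatten the nested literal into plain options, e.g.
# ['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv']
print(get_literal_args(TFileFormat))
```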
TTypeDetections = Literal[ "timestamp", "iso_timestamp", "iso_date", "large_integer", "hexbytes_to_text", "wei_to_double" ] @@ -72,7 +76,7 @@ TColumnNames = Union[str, Sequence[str]] """A string representing a column name or a list of""" -COLUMN_PROPS: Set[TColumnProp] = set(get_args(TColumnProp)) +# COLUMN_PROPS: Set[TColumnProp] = set(get_args(TColumnProp)) COLUMN_HINTS: Set[TColumnHint] = set( [ "partition", @@ -153,9 +157,18 @@ class NormalizerInfo(TypedDict, total=True): new_table: bool -TWriteDisposition = Literal["skip", "append", "replace", "merge"] -TLoaderMergeStrategy = Literal["delete-insert", "scd2"] +# Part of Table containing processing hints added by pipeline stages +TTableProcessingHints = TypedDict( + "TTableProcessingHints", + { + "x-normalizer": Optional[Dict[str, Any]], + "x-loader": Optional[Dict[str, Any]], + "x-extractor": Optional[Dict[str, Any]], + }, + total=False, +) +TLoaderMergeStrategy = Literal["delete-insert", "scd2"] WRITE_DISPOSITIONS: Set[TWriteDisposition] = set(get_args(TWriteDisposition)) MERGE_STRATEGIES: Set[TLoaderMergeStrategy] = set(get_args(TLoaderMergeStrategy)) @@ -178,7 +191,8 @@ class TMergeDispositionDict(TWriteDispositionDict, total=False): TWriteDispositionConfig = Union[TWriteDisposition, TWriteDispositionDict, TMergeDispositionDict] -class TTableSchema(TypedDict, total=False): +# TypedDict that defines properties of a table +class TTableSchema(TTableProcessingHints, total=False): """TypedDict that defines properties of a table""" name: Optional[str] @@ -191,6 +205,7 @@ class TTableSchema(TypedDict, total=False): columns: TTableSchemaColumns resource: Optional[str] table_format: Optional[TTableFormat] + file_format: Optional[TFileFormat] class TPartialTableSchema(TTableSchema): diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 51269cbb38..f5765be351 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -7,6 +7,7 @@ from dlt.common.pendulum import pendulum from dlt.common.time import ensure_pendulum_datetime +from dlt.common import logger from dlt.common.json import json from dlt.common.data_types import TDataType from dlt.common.exceptions import DictValidationException @@ -21,12 +22,15 @@ LOADS_TABLE_NAME, SIMPLE_REGEX_PREFIX, VERSION_TABLE_NAME, + PIPELINE_STATE_TABLE_NAME, TColumnName, + TFileFormat, TPartialTableSchema, TSchemaTables, TSchemaUpdate, TSimpleRegex, TStoredSchema, + TTableProcessingHints, TTableSchema, TColumnSchemaBase, TColumnSchema, @@ -96,7 +100,8 @@ def apply_defaults(stored_schema: TStoredSchema) -> TStoredSchema: def remove_defaults(stored_schema: TStoredSchema) -> TStoredSchema: """Removes default values from `stored_schema` in place, returns the input for chaining - Default values are removed from table schemas and complete column schemas. Incomplete columns are preserved intact. 
+ * removes column and table names from the value + * removed resource name if same as table name """ clean_tables = deepcopy(stored_schema["tables"]) for table_name, t in clean_tables.items(): @@ -202,6 +207,33 @@ def verify_schema_hash( return hash_ == stored_schema["version_hash"] +def normalize_simple_regex_column(naming: NamingConvention, regex: TSimpleRegex) -> TSimpleRegex: + """Assumes that regex applies to column name and normalizes it.""" + + def _normalize(r_: str) -> str: + is_exact = len(r_) >= 2 and r_[0] == "^" and r_[-1] == "$" + if is_exact: + r_ = r_[1:-1] + # if this a simple string then normalize it + if r_ == re.escape(r_): + r_ = naming.normalize_path(r_) + if is_exact: + r_ = "^" + r_ + "$" + return r_ + + if regex.startswith(SIMPLE_REGEX_PREFIX): + return cast(TSimpleRegex, SIMPLE_REGEX_PREFIX + _normalize(regex[3:])) + else: + return cast(TSimpleRegex, _normalize(regex)) + + +def canonical_simple_regex(regex: str) -> TSimpleRegex: + if regex.startswith(SIMPLE_REGEX_PREFIX): + return cast(TSimpleRegex, regex) + else: + return cast(TSimpleRegex, SIMPLE_REGEX_PREFIX + "^" + regex + "$") + + def simple_regex_validator(path: str, pk: str, pv: Any, t: Any) -> bool: # custom validator on type TSimpleRegex if t is TSimpleRegex: @@ -237,7 +269,7 @@ def simple_regex_validator(path: str, pk: str, pv: Any, t: Any) -> bool: # we know how to validate that type return True else: - # don't know how to validate t + # don't know how to validate this return False @@ -299,7 +331,9 @@ def validate_stored_schema(stored_schema: TStoredSchema) -> None: parent_table_name = table.get("parent") if parent_table_name: if parent_table_name not in stored_schema["tables"]: - raise ParentTableNotFoundException(table_name, parent_table_name) + raise ParentTableNotFoundException( + stored_schema["name"], table_name, parent_table_name + ) def autodetect_sc_type(detection_fs: Sequence[TTypeDetections], t: Type[Any], v: Any) -> TDataType: @@ -370,7 +404,9 @@ def merge_columns( return columns_a -def diff_table(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTableSchema: +def diff_table( + schema_name: str, tab_a: TTableSchema, tab_b: TPartialTableSchema +) -> TPartialTableSchema: """Creates a partial table that contains properties found in `tab_b` that are not present or different in `tab_a`. The name is always present in returned partial. It returns new columns (not present in tab_a) and merges columns from tab_b into tab_a (overriding non-default hint values). 
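Editorial aside: the hunk above adds `normalize_simple_regex_column` and `canonical_simple_regex` to `dlt/common/schema/utils.py`. Together they let column hints declared either as plain names or as `re:`-prefixed regexes be normalized and compared in one canonical form, which is what `_merge_hints` and `_update_preferred_types` in `schema.py` rely on to deduplicate overlapping hints. A minimal behavioral sketch, not part of the patch; it assumes `SIMPLE_REGEX_PREFIX` is the `re:` marker and that the snake_case naming convention is importable from the path shown.

```py
# Sketch only, not part of the patch. Assumes SIMPLE_REGEX_PREFIX is the "re:" marker
# used for simple regex hints and that the snake_case naming convention lives at the
# import path below; both are assumptions.
from dlt.common.normalizers.naming.snake_case import NamingConvention
from dlt.common.schema.utils import canonical_simple_regex, normalize_simple_regex_column

naming = NamingConvention()

# a plain column name is wrapped into an exact-match regex, so it can be compared
# with hints that were already declared in regex form
assert canonical_simple_regex("CreatedAt") == "re:^CreatedAt$"
assert canonical_simple_regex("re:^created_at$") == "re:^created_at$"

# exact-match regexes over plain names are renamed with the naming convention,
# while open-ended regexes are left untouched
assert normalize_simple_regex_column(naming, "re:^CreatedAt$") == "re:^created_at$"
assert normalize_simple_regex_column(naming, "re:timestamp$") == "re:timestamp$"
```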
@@ -384,7 +420,7 @@ def diff_table(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTable # check if table properties can be merged if tab_a.get("parent") != tab_b.get("parent"): raise TablePropertiesConflictException( - table_name, "parent", tab_a.get("parent"), tab_b.get("parent") + schema_name, table_name, "parent", tab_a.get("parent"), tab_b.get("parent") ) # get new columns, changes in the column data type or other properties are not allowed @@ -398,6 +434,7 @@ def diff_table(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTable if not compare_complete_columns(tab_a_columns[col_b_name], col_b): # attempt to update to incompatible columns raise CannotCoerceColumnException( + schema_name, table_name, col_b_name, col_b["data_type"], @@ -426,7 +463,7 @@ def diff_table(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTable # this should not really happen if tab_a.get("parent") is not None and (resource := tab_b.get("resource")): raise TablePropertiesConflictException( - table_name, "resource", resource, tab_a.get("parent") + schema_name, table_name, "resource", resource, tab_a.get("parent") ) return partial_table @@ -444,7 +481,9 @@ def diff_table(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTable # return False -def merge_table(table: TTableSchema, partial_table: TPartialTableSchema) -> TPartialTableSchema: +def merge_table( + schema_name: str, table: TTableSchema, partial_table: TPartialTableSchema +) -> TPartialTableSchema: """Merges "partial_table" into "table". `table` is merged in place. Returns the diff partial table. `table` and `partial_table` names must be identical. A table diff is generated and applied to `table`: @@ -456,9 +495,10 @@ def merge_table(table: TTableSchema, partial_table: TPartialTableSchema) -> TPar if table["name"] != partial_table["name"]: raise TablePropertiesConflictException( - table["name"], "name", table["name"], partial_table["name"] + schema_name, table["name"], "name", table["name"], partial_table["name"] ) - diff = diff_table(table, partial_table) + diff = diff_table(schema_name, table, partial_table) + # add new columns when all checks passed updated_columns = merge_columns(table["columns"], diff["columns"]) table.update(diff) table["columns"] = updated_columns @@ -466,9 +506,67 @@ def merge_table(table: TTableSchema, partial_table: TPartialTableSchema) -> TPar return diff +def normalize_table_identifiers(table: TTableSchema, naming: NamingConvention) -> TTableSchema: + """Normalizes all table and column names in `table` schema according to current schema naming convention and returns + new instance with modified table schema. + + Naming convention like snake_case may produce name collisions with the column names. Colliding column schemas are merged + where the column that is defined later in the dictionary overrides earlier column. + + Note that resource name is not normalized. + """ + + table = copy(table) + table["name"] = naming.normalize_tables_path(table["name"]) + parent = table.get("parent") + if parent: + table["parent"] = naming.normalize_tables_path(parent) + columns = table.get("columns") + if columns: + new_columns: TTableSchemaColumns = {} + for c in columns.values(): + c = copy(c) + origin_c_name = c["name"] + new_col_name = c["name"] = naming.normalize_path(c["name"]) + # re-index columns as the name changed, if name space was reduced then + # some columns now collide with each other. 
so make sure that we merge columns that are already there + if new_col_name in new_columns: + new_columns[new_col_name] = merge_column( + new_columns[new_col_name], c, merge_defaults=False + ) + logger.warning( + f"In schema {naming} column {origin_c_name} got normalized into" + f" {new_col_name} which collides with other column. Both columns got merged" + " into one." + ) + else: + new_columns[new_col_name] = c + table["columns"] = new_columns + return table + + def has_table_seen_data(table: TTableSchema) -> bool: """Checks if normalizer has seen data coming to the table.""" - return "x-normalizer" in table and table["x-normalizer"].get("seen-data", None) is True # type: ignore[typeddict-item] + return "x-normalizer" in table and table["x-normalizer"].get("seen-data", None) is True + + +def remove_processing_hints(tables: TSchemaTables) -> TSchemaTables: + "Removes processing hints like x-normalizer and x-loader from schema tables. Modifies the input tables and returns it for convenience" + for table_name, hints in get_processing_hints(tables).items(): + for hint in hints: + del tables[table_name][hint] # type: ignore[misc] + return tables + + +def get_processing_hints(tables: TSchemaTables) -> Dict[str, List[str]]: + """Finds processing hints in a set of tables and returns table_name: [hints] mapping""" + hints: Dict[str, List[str]] = {} + for table in tables.values(): + for hint in TTableProcessingHints.__annotations__.keys(): + if hint in table: + table_hints = hints.setdefault(table["name"], []) + table_hints.append(hint) + return hints def hint_to_column_prop(h: TColumnHint) -> TColumnProp: @@ -581,6 +679,12 @@ def get_table_format(tables: TSchemaTables, table_name: str) -> TTableFormat: ) +def get_file_format(tables: TSchemaTables, table_name: str) -> TFileFormat: + return cast( + TFileFormat, get_inherited_table_hint(tables, table_name, "file_format", allow_none=True) + ) + + def fill_hints_from_parent_and_clone_table( tables: TSchemaTables, table: TTableSchema ) -> TTableSchema: @@ -592,6 +696,8 @@ def fill_hints_from_parent_and_clone_table( table["write_disposition"] = get_write_disposition(tables, table["name"]) if "table_format" not in table: table["table_format"] = get_table_format(tables, table["name"]) + if "file_format" not in table: + table["file_format"] = get_file_format(tables, table["name"]) return table @@ -650,6 +756,8 @@ def group_tables_by_resource( def version_table() -> TTableSchema: # NOTE: always add new columns at the end of the table so we have identical layout # after an update of existing tables (always at the end) + # set to nullable so we can migrate existing tables + # WARNING: do not reorder the columns table = new_table( VERSION_TABLE_NAME, columns=[ @@ -670,9 +778,11 @@ def version_table() -> TTableSchema: return table -def load_table() -> TTableSchema: +def loads_table() -> TTableSchema: # NOTE: always add new columns at the end of the table so we have identical layout # after an update of existing tables (always at the end) + # set to nullable so we can migrate existing tables + # WARNING: do not reorder the columns table = new_table( LOADS_TABLE_NAME, columns=[ @@ -692,6 +802,30 @@ def load_table() -> TTableSchema: return table +def pipeline_state_table() -> TTableSchema: + # NOTE: always add new columns at the end of the table so we have identical layout + # after an update of existing tables (always at the end) + # set to nullable so we can migrate existing tables + # WARNING: do not reorder the columns + table = new_table( + 
PIPELINE_STATE_TABLE_NAME, + write_disposition="append", + columns=[ + {"name": "version", "data_type": "bigint", "nullable": False}, + {"name": "engine_version", "data_type": "bigint", "nullable": False}, + {"name": "pipeline_name", "data_type": "text", "nullable": False}, + {"name": "state", "data_type": "text", "nullable": False}, + {"name": "created_at", "data_type": "timestamp", "nullable": False}, + {"name": "version_hash", "data_type": "text", "nullable": True}, + {"name": "_dlt_load_id", "data_type": "text", "nullable": False}, + ], + # always use caps preferred file format for processing + file_format="preferred", + ) + table["description"] = "Created by DLT. Tracks pipeline state" + return table + + def new_table( table_name: str, parent_table_name: str = None, @@ -701,6 +835,7 @@ def new_table( resource: str = None, schema_contract: TSchemaContract = None, table_format: TTableFormat = None, + file_format: TFileFormat = None, ) -> TTableSchema: table: TTableSchema = { "name": table_name, @@ -719,6 +854,8 @@ def new_table( table["schema_contract"] = schema_contract if table_format: table["table_format"] = table_format + if file_format: + table["file_format"] = file_format if validate_schema: validate_dict_ignoring_xkeys( spec=TColumnSchema, @@ -754,7 +891,7 @@ def new_column( return column -def standard_hints() -> Dict[TColumnHint, List[TSimpleRegex]]: +def default_hints() -> Dict[TColumnHint, List[TSimpleRegex]]: return None diff --git a/dlt/common/storages/data_item_storage.py b/dlt/common/storages/data_item_storage.py index f6072c0260..29a9da8acf 100644 --- a/dlt/common/storages/data_item_storage.py +++ b/dlt/common/storages/data_item_storage.py @@ -60,15 +60,18 @@ def import_items_file( table_name: str, file_path: str, metrics: DataWriterMetrics, + with_extension: str = None, ) -> DataWriterMetrics: """Import a file from `file_path` into items storage under a new file name. Does not check the imported file format. Uses counts from `metrics` as a base. Logically closes the imported file The preferred import method is a hard link to avoid copying the data. If current filesystem does not support it, a regular copy is used. + + Alternative extension may be provided via `with_extension` so various file formats may be imported into the same folder. 
""" writer = self._get_writer(load_id, schema_name, table_name) - return writer.import_file(file_path, metrics) + return writer.import_file(file_path, metrics, with_extension) def close_writers(self, load_id: str, skip_flush: bool = False) -> None: """Flush, write footers (skip_flush), write metrics and close files in all diff --git a/dlt/common/storages/exceptions.py b/dlt/common/storages/exceptions.py index 26a76bb5c0..028491dd9c 100644 --- a/dlt/common/storages/exceptions.py +++ b/dlt/common/storages/exceptions.py @@ -79,6 +79,23 @@ def __init__(self, load_id: str) -> None: super().__init__(f"Package with load id {load_id} could not be found") +class LoadPackageAlreadyCompleted(LoadStorageException): + def __init__(self, load_id: str) -> None: + self.load_id = load_id + super().__init__( + f"Package with load id {load_id} is already completed, but another complete was" + " requested" + ) + + +class LoadPackageNotCompleted(LoadStorageException): + def __init__(self, load_id: str) -> None: + self.load_id = load_id + super().__init__( + f"Package with load id {load_id} is not yet completed, but method required that" + ) + + class SchemaStorageException(StorageException): pass diff --git a/dlt/common/storages/file_storage.py b/dlt/common/storages/file_storage.py index d768ec720a..7d14b8f7f7 100644 --- a/dlt/common/storages/file_storage.py +++ b/dlt/common/storages/file_storage.py @@ -6,7 +6,7 @@ import tempfile import shutil import pathvalidate -from typing import IO, Any, Optional, List, cast, overload +from typing import IO, Any, Optional, List, cast from dlt.common.typing import AnyFun from dlt.common.utils import encoding_for_mode, uniq_id @@ -18,7 +18,7 @@ class FileStorage: def __init__(self, storage_path: str, file_type: str = "t", makedirs: bool = False) -> None: # make it absolute path - self.storage_path = os.path.realpath(storage_path) # os.path.join(, '') + self.storage_path = os.path.realpath(storage_path) self.file_type = file_type if makedirs: os.makedirs(storage_path, exist_ok=True) @@ -243,7 +243,8 @@ def atomic_import( FileStorage.move_atomic_to_file(external_file_path, dest_file_path) ) - def in_storage(self, path: str) -> bool: + def is_path_in_storage(self, path: str) -> bool: + """Checks if a given path is below storage root, without checking for item existence""" assert path is not None # all paths are relative to root if not os.path.isabs(path): @@ -256,25 +257,30 @@ def in_storage(self, path: str) -> bool: def to_relative_path(self, path: str) -> str: if path == "": return "" - if not self.in_storage(path): + if not self.is_path_in_storage(path): raise ValueError(path) if not os.path.isabs(path): path = os.path.realpath(os.path.join(self.storage_path, path)) # for abs paths find the relative return os.path.relpath(path, start=self.storage_path) - def make_full_path(self, path: str) -> str: + def make_full_path_safe(self, path: str) -> str: + """Verifies that path is under storage root and then returns normalized absolute path""" # try to make a relative path if paths are absolute or overlapping path = self.to_relative_path(path) # then assume that it is a path relative to storage root return os.path.realpath(os.path.join(self.storage_path, path)) + def make_full_path(self, path: str) -> str: + """Joins path with storage root. 
Intended for path known to be relative to storage root""" + return os.path.join(self.storage_path, path) + def from_wd_to_relative_path(self, wd_relative_path: str) -> str: path = os.path.realpath(wd_relative_path) return self.to_relative_path(path) def from_relative_path_to_wd(self, relative_path: str) -> str: - return os.path.relpath(self.make_full_path(relative_path), start=".") + return os.path.relpath(self.make_full_path_safe(relative_path), start=".") @staticmethod def get_file_name_from_file_path(file_path: str) -> str: diff --git a/dlt/common/storages/fsspec_filesystem.py b/dlt/common/storages/fsspec_filesystem.py index a21f0f2c0c..f419baed03 100644 --- a/dlt/common/storages/fsspec_filesystem.py +++ b/dlt/common/storages/fsspec_filesystem.py @@ -5,7 +5,6 @@ import pathlib import posixpath from io import BytesIO -from gzip import GzipFile from typing import ( Literal, cast, diff --git a/dlt/common/storages/live_schema_storage.py b/dlt/common/storages/live_schema_storage.py index fd4ecc968e..1ecc491174 100644 --- a/dlt/common/storages/live_schema_storage.py +++ b/dlt/common/storages/live_schema_storage.py @@ -32,20 +32,6 @@ def remove_schema(self, name: str) -> None: # also remove the live schema self.live_schemas.pop(name, None) - def save_import_schema_if_not_exists(self, schema: Schema) -> bool: - """Saves import schema, if not exists. If schema was saved, link itself as imported from""" - if self.config.import_schema_path: - try: - self._load_import_schema(schema.name) - except FileNotFoundError: - # save import schema only if it not exist - self._export_schema(schema, self.config.import_schema_path) - # if import schema got saved then add own version hash as import version hash - schema._imported_version_hash = schema.version_hash - return True - - return False - def commit_live_schema(self, name: str) -> str: """Saves live schema in storage if it was modified""" if not self.is_live_schema_committed(name): diff --git a/dlt/common/storages/load_package.py b/dlt/common/storages/load_package.py index 4d72458e3e..9e3185221d 100644 --- a/dlt/common/storages/load_package.py +++ b/dlt/common/storages/load_package.py @@ -5,7 +5,7 @@ import datetime # noqa: 251 import humanize -from pathlib import Path +from pathlib import PurePath from pendulum.datetime import DateTime from typing import ( ClassVar, @@ -37,7 +37,12 @@ from dlt.common.schema import Schema, TSchemaTables from dlt.common.schema.typing import TStoredSchema, TTableSchemaColumns, TTableSchema from dlt.common.storages import FileStorage -from dlt.common.storages.exceptions import LoadPackageNotFound, CurrentLoadPackageStateNotAvailable +from dlt.common.storages.exceptions import ( + LoadPackageAlreadyCompleted, + LoadPackageNotCompleted, + LoadPackageNotFound, + CurrentLoadPackageStateNotAvailable, +) from dlt.common.typing import DictStrAny, SupportsHumanize from dlt.common.utils import flatten_list_or_items from dlt.common.versioned_state import ( @@ -52,6 +57,7 @@ TJobFileFormat = Literal["sql", "reference", TLoaderFileFormat] """Loader file formats with internal job types""" +JOB_EXCEPTION_EXTENSION = ".exception" class TPipelineStateDoc(TypedDict, total=False): @@ -61,9 +67,9 @@ class TPipelineStateDoc(TypedDict, total=False): engine_version: int pipeline_name: str state: str - version_hash: str created_at: datetime.datetime - dlt_load_id: NotRequired[str] + version_hash: str + _dlt_load_id: NotRequired[str] class TLoadPackageState(TVersionedState, total=False): @@ -165,7 +171,7 @@ def with_retry(self) -> 
"ParsedLoadJobFileName": @staticmethod def parse(file_name: str) -> "ParsedLoadJobFileName": - p = Path(file_name) + p = PurePath(file_name) parts = p.name.split(".") if len(parts) != 4: raise TerminalValueError(parts) @@ -319,13 +325,16 @@ def __init__(self, storage: FileStorage, initial_state: TLoadPackageStatus) -> N # def get_package_path(self, load_id: str) -> str: + """Gets path of the package relative to storage root""" return load_id - def get_job_folder_path(self, load_id: str, folder: TJobState) -> str: - return os.path.join(self.get_package_path(load_id), folder) + def get_job_state_folder_path(self, load_id: str, state: TJobState) -> str: + """Gets path to the jobs in `state` in package `load_id`, relative to the storage root""" + return os.path.join(self.get_package_path(load_id), state) - def get_job_file_path(self, load_id: str, folder: TJobState, file_name: str) -> str: - return os.path.join(self.get_job_folder_path(load_id, folder), file_name) + def get_job_file_path(self, load_id: str, state: TJobState, file_name: str) -> str: + """Get path to job with `file_name` in `state` in package `load_id`, relative to the storage root""" + return os.path.join(self.get_job_state_folder_path(load_id, state), file_name) def list_packages(self) -> Sequence[str]: """Lists all load ids in storage, earliest first @@ -338,29 +347,42 @@ def list_packages(self) -> Sequence[str]: def list_new_jobs(self, load_id: str) -> Sequence[str]: new_jobs = self.storage.list_folder_files( - self.get_job_folder_path(load_id, PackageStorage.NEW_JOBS_FOLDER) + self.get_job_state_folder_path(load_id, PackageStorage.NEW_JOBS_FOLDER) ) return new_jobs def list_started_jobs(self, load_id: str) -> Sequence[str]: return self.storage.list_folder_files( - self.get_job_folder_path(load_id, PackageStorage.STARTED_JOBS_FOLDER) + self.get_job_state_folder_path(load_id, PackageStorage.STARTED_JOBS_FOLDER) ) def list_failed_jobs(self, load_id: str) -> Sequence[str]: - return self.storage.list_folder_files( - self.get_job_folder_path(load_id, PackageStorage.FAILED_JOBS_FOLDER) - ) - - def list_jobs_for_table(self, load_id: str, table_name: str) -> Sequence[LoadJobInfo]: - return self.filter_jobs_for_table(self.list_all_jobs(load_id), table_name) - - def list_all_jobs(self, load_id: str) -> Sequence[LoadJobInfo]: - info = self.get_load_package_info(load_id) - return [job for job in flatten_list_or_items(iter(info.jobs.values()))] # type: ignore + return [ + file + for file in self.storage.list_folder_files( + self.get_job_state_folder_path(load_id, PackageStorage.FAILED_JOBS_FOLDER) + ) + if not file.endswith(JOB_EXCEPTION_EXTENSION) + ] + + def list_job_with_states_for_table( + self, load_id: str, table_name: str + ) -> Sequence[Tuple[TJobState, ParsedLoadJobFileName]]: + return self.filter_jobs_for_table(self.list_all_jobs_with_states(load_id), table_name) + + def list_all_jobs_with_states( + self, load_id: str + ) -> Sequence[Tuple[TJobState, ParsedLoadJobFileName]]: + info = self.get_load_package_jobs(load_id) + state_jobs = [] + for state, jobs in info.items(): + state_jobs.extend([(state, job) for job in jobs]) + return state_jobs def list_failed_jobs_infos(self, load_id: str) -> Sequence[LoadJobInfo]: """List all failed jobs and associated error messages for a load package with `load_id`""" + if not self.is_package_completed(load_id): + raise LoadPackageNotCompleted(load_id) failed_jobs: List[LoadJobInfo] = [] package_path = self.get_package_path(load_id) package_created_at = pendulum.from_timestamp( @@ -371,12 
+393,19 @@ def list_failed_jobs_infos(self, load_id: str) -> Sequence[LoadJobInfo]: ) ) for file in self.list_failed_jobs(load_id): - if not file.endswith(".exception"): - failed_jobs.append( - self._read_job_file_info("failed_jobs", file, package_created_at) + failed_jobs.append( + self._read_job_file_info( + load_id, "failed_jobs", ParsedLoadJobFileName.parse(file), package_created_at ) + ) return failed_jobs + def is_package_completed(self, load_id: str) -> bool: + package_path = self.get_package_path(load_id) + return self.storage.has_file( + os.path.join(package_path, PackageStorage.PACKAGE_COMPLETED_FILE_NAME) + ) + # # Move jobs # @@ -385,7 +414,9 @@ def import_job( self, load_id: str, job_file_path: str, job_state: TJobState = "new_jobs" ) -> None: """Adds new job by moving the `job_file_path` into `new_jobs` of package `load_id`""" - self.storage.atomic_import(job_file_path, self.get_job_folder_path(load_id, job_state)) + self.storage.atomic_import( + job_file_path, self.get_job_state_folder_path(load_id, job_state) + ) def start_job(self, load_id: str, file_name: str) -> str: return self._move_job( @@ -397,7 +428,7 @@ def fail_job(self, load_id: str, file_name: str, failed_message: Optional[str]) if failed_message: self.storage.save( self.get_job_file_path( - load_id, PackageStorage.FAILED_JOBS_FOLDER, file_name + ".exception" + load_id, PackageStorage.FAILED_JOBS_FOLDER, file_name + JOB_EXCEPTION_EXTENSION ), failed_message, ) @@ -455,6 +486,8 @@ def create_package(self, load_id: str, initial_state: TLoadPackageState = None) def complete_loading_package(self, load_id: str, load_state: TLoadPackageStatus) -> str: """Completes loading the package by writing marker file with`package_state. Returns path to the completed package""" load_path = self.get_package_path(load_id) + if self.is_package_completed(load_id): + raise LoadPackageAlreadyCompleted(load_id) # save marker file self.storage.save( os.path.join(load_path, PackageStorage.PACKAGE_COMPLETED_FILE_NAME), load_state @@ -468,7 +501,7 @@ def remove_completed_jobs(self, load_id: str) -> None: # delete completed jobs if not has_failed_jobs: self.storage.delete_folder( - self.get_job_folder_path(load_id, PackageStorage.COMPLETED_JOBS_FOLDER), + self.get_job_state_folder_path(load_id, PackageStorage.COMPLETED_JOBS_FOLDER), recursively=True, ) @@ -533,11 +566,32 @@ def get_load_package_state_path(self, load_id: str) -> str: # Get package info # - def get_load_package_info(self, load_id: str) -> LoadPackageInfo: - """Gets information on normalized/completed package with given load_id, all jobs and their statuses.""" + def get_load_package_jobs(self, load_id: str) -> Dict[TJobState, List[ParsedLoadJobFileName]]: + """Gets all jobs in a package and returns them as lists assigned to a particular state.""" package_path = self.get_package_path(load_id) if not self.storage.has_folder(package_path): raise LoadPackageNotFound(load_id) + all_jobs: Dict[TJobState, List[ParsedLoadJobFileName]] = {} + for state in WORKING_FOLDERS: + jobs: List[ParsedLoadJobFileName] = [] + with contextlib.suppress(FileNotFoundError): + # we ignore if load package lacks one of working folders. 
completed_jobs may be deleted on archiving + for file in self.storage.list_folder_files( + self.get_job_state_folder_path(load_id, state), to_root=False + ): + if not file.endswith(JOB_EXCEPTION_EXTENSION): + jobs.append(ParsedLoadJobFileName.parse(file)) + all_jobs[state] = jobs + return all_jobs + + def get_load_package_info(self, load_id: str) -> LoadPackageInfo: + """Gets information on normalized/completed package with given load_id, all jobs and their statuses. + + Will reach to the file system to get additional stats, mtime, also collects exceptions for failed jobs. + NOTE: do not call this function often. it should be used only to generate metrics + """ + package_path = self.get_package_path(load_id) + package_jobs = self.get_load_package_jobs(load_id) package_created_at: DateTime = None package_state = self.initial_state @@ -560,15 +614,11 @@ def get_load_package_info(self, load_id: str) -> LoadPackageInfo: schema = Schema.from_dict(self._load_schema(load_id)) # read jobs with all statuses - all_jobs: Dict[TJobState, List[LoadJobInfo]] = {} - for state in WORKING_FOLDERS: - jobs: List[LoadJobInfo] = [] - with contextlib.suppress(FileNotFoundError): - # we ignore if load package lacks one of working folders. completed_jobs may be deleted on archiving - for file in self.storage.list_folder_files(os.path.join(package_path, state)): - if not file.endswith(".exception"): - jobs.append(self._read_job_file_info(state, file, package_created_at)) - all_jobs[state] = jobs + all_job_infos: Dict[TJobState, List[LoadJobInfo]] = {} + for state, jobs in package_jobs.items(): + all_job_infos[state] = [ + self._read_job_file_info(load_id, state, job, package_created_at) for job in jobs + ] return LoadPackageInfo( load_id, @@ -577,15 +627,46 @@ def get_load_package_info(self, load_id: str) -> LoadPackageInfo: schema, applied_update, package_created_at, - all_jobs, + all_job_infos, ) - def _read_job_file_info(self, state: TJobState, file: str, now: DateTime = None) -> LoadJobInfo: - try: - failed_message = self.storage.load(file + ".exception") - except FileNotFoundError: - failed_message = None - full_path = self.storage.make_full_path(file) + def get_job_failed_message(self, load_id: str, job: ParsedLoadJobFileName) -> str: + """Get exception message of a failed job.""" + rel_path = self.get_job_file_path(load_id, "failed_jobs", job.file_name()) + if not self.storage.has_file(rel_path): + raise FileNotFoundError(rel_path) + failed_message: str = None + with contextlib.suppress(FileNotFoundError): + failed_message = self.storage.load(rel_path + JOB_EXCEPTION_EXTENSION) + return failed_message + + def job_to_job_info( + self, load_id: str, state: TJobState, job: ParsedLoadJobFileName + ) -> LoadJobInfo: + """Creates partial job info by converting job object. 
size, mtime and failed message will not be populated""" + full_path = os.path.join( + self.storage.storage_path, self.get_job_file_path(load_id, state, job.file_name()) + ) + return LoadJobInfo( + state, + full_path, + 0, + None, + 0, + job, + None, + ) + + def _read_job_file_info( + self, load_id: str, state: TJobState, job: ParsedLoadJobFileName, now: DateTime = None + ) -> LoadJobInfo: + """Creates job info by reading additional props from storage""" + failed_message = None + if state == "failed_jobs": + failed_message = self.get_job_failed_message(load_id, job) + full_path = os.path.join( + self.storage.storage_path, self.get_job_file_path(load_id, state, job.file_name()) + ) st = os.stat(full_path) return LoadJobInfo( state, @@ -593,7 +674,7 @@ def _read_job_file_info(self, state: TJobState, file: str, now: DateTime = None) st.st_size, pendulum.from_timestamp(st.st_mtime), PackageStorage._job_elapsed_time_seconds(full_path, now.timestamp() if now else None), - ParsedLoadJobFileName.parse(file), + job, failed_message, ) @@ -611,10 +692,11 @@ def _move_job( ) -> str: # ensure we move file names, not paths assert file_name == FileStorage.get_file_name_from_file_path(file_name) - load_path = self.get_package_path(load_id) - dest_path = os.path.join(load_path, dest_folder, new_file_name or file_name) - self.storage.atomic_rename(os.path.join(load_path, source_folder, file_name), dest_path) - # print(f"{join(load_path, source_folder, file_name)} -> {dest_path}") + + dest_path = self.get_job_file_path(load_id, dest_folder, new_file_name or file_name) + self.storage.atomic_rename( + self.get_job_file_path(load_id, source_folder, file_name), dest_path + ) return self.storage.make_full_path(dest_path) def _load_schema(self, load_id: str) -> DictStrAny: @@ -659,9 +741,9 @@ def _job_elapsed_time_seconds(file_path: str, now_ts: float = None) -> float: @staticmethod def filter_jobs_for_table( - all_jobs: Iterable[LoadJobInfo], table_name: str - ) -> Sequence[LoadJobInfo]: - return [job for job in all_jobs if job.job_file_info.table_name == table_name] + all_jobs: Iterable[Tuple[TJobState, ParsedLoadJobFileName]], table_name: str + ) -> Sequence[Tuple[TJobState, ParsedLoadJobFileName]]: + return [job for job in all_jobs if job[1].table_name == table_name] @configspec diff --git a/dlt/common/storages/schema_storage.py b/dlt/common/storages/schema_storage.py index 1afed18929..0544de696f 100644 --- a/dlt/common/storages/schema_storage.py +++ b/dlt/common/storages/schema_storage.py @@ -5,7 +5,7 @@ from dlt.common.json import json from dlt.common.configuration import with_config from dlt.common.configuration.accessors import config -from dlt.common.schema.utils import to_pretty_json, to_pretty_yaml +from dlt.common.schema.utils import get_processing_hints, to_pretty_json, to_pretty_yaml from dlt.common.storages.configuration import ( SchemaStorageConfiguration, TSchemaFileFormat, @@ -57,6 +57,14 @@ def load_schema(self, name: str) -> Schema: return Schema.from_dict(storage_schema) def save_schema(self, schema: Schema) -> str: + """Saves schema to the storage and returns the path relative to storage. + + If import schema path is configured and import schema with schema.name exits, it + will be linked to `schema` via `_imported_version_hash`. Such hash is used in `load_schema` to + detect if import schema changed and thus to overwrite the storage schema. + + If export schema path is configured, `schema` will be exported to it. 
+ """ # check if there's schema to import if self.config.import_schema_path: try: @@ -66,11 +74,25 @@ def save_schema(self, schema: Schema) -> str: except FileNotFoundError: # just save the schema pass - path = self._save_schema(schema) - if self.config.export_schema_path: - self._export_schema(schema, self.config.export_schema_path) + path = self._save_and_export_schema(schema) return path + def save_import_schema_if_not_exists(self, schema: Schema) -> bool: + """Saves import schema, if not exists. If schema was saved, link itself as imported from""" + if self.config.import_schema_path: + try: + self._load_import_schema(schema.name) + except FileNotFoundError: + # save import schema only if it not exist + self._export_schema( + schema, self.config.import_schema_path, remove_processing_hints=True + ) + # if import schema got saved then add own version hash as import version hash + schema._imported_version_hash = schema.version_hash + return True + + return False + def remove_schema(self, name: str) -> None: schema_file = self._file_name_in_store(name, "json") self.storage.delete(schema_file) @@ -116,25 +138,32 @@ def _maybe_import_schema(self, name: str, storage_schema: DictStrAny = None) -> f" {rv_schema._imported_version_hash}" ) # if schema was imported, overwrite storage schema - self._save_schema(rv_schema) - if self.config.export_schema_path: - self._export_schema(rv_schema, self.config.export_schema_path) + self._save_and_export_schema(rv_schema, check_processing_hints=True) else: # import schema when imported schema was modified from the last import rv_schema = Schema.from_dict(storage_schema) i_s = Schema.from_dict(imported_schema) if i_s.version_hash != rv_schema._imported_version_hash: + logger.warning( + f"Schema {name} was present in schema storage at" + f" {self.storage.storage_path} but will be overwritten with imported schema" + f" version {i_s.version} and imported hash {i_s.version_hash}" + ) + tables_seen_data = rv_schema.data_tables(seen_data_only=True) + if tables_seen_data: + logger.warning( + f"Schema {name} in schema storage contains tables" + f" ({', '.join(t['name'] for t in tables_seen_data)}) that are present" + " in the destination. 
If you changed schema of those tables in import" + " schema, consider using one of the refresh options:" + " https://dlthub.com/devel/general-usage/pipeline#refresh-pipeline-data-and-state" + ) + rv_schema.replace_schema_content(i_s, link_to_replaced_schema=True) rv_schema._imported_version_hash = i_s.version_hash - logger.info( - f"Schema {name} was present in {self.storage.storage_path} but is" - f" overwritten with imported schema version {i_s.version} and" - f" imported hash {i_s.version_hash}" - ) + # if schema was imported, overwrite storage schema - self._save_schema(rv_schema) - if self.config.export_schema_path: - self._export_schema(rv_schema, self.config.export_schema_path) + self._save_and_export_schema(rv_schema, check_processing_hints=True) except FileNotFoundError: # no schema to import -> skip silently and return the original if storage_schema is None: @@ -156,8 +185,13 @@ def _load_import_schema(self, name: str) -> DictStrAny: import_storage.load(schema_file), self.config.external_schema_format ) - def _export_schema(self, schema: Schema, export_path: str) -> None: - stored_schema = schema.to_dict(remove_defaults=True) + def _export_schema( + self, schema: Schema, export_path: str, remove_processing_hints: bool = False + ) -> None: + stored_schema = schema.to_dict( + remove_defaults=self.config.external_schema_format_remove_defaults, + remove_processing_hints=remove_processing_hints, + ) if self.config.external_schema_format == "json": exported_schema_s = to_pretty_json(stored_schema) elif self.config.external_schema_format == "yaml": @@ -175,7 +209,7 @@ def _export_schema(self, schema: Schema, export_path: str) -> None: ) def _save_schema(self, schema: Schema) -> str: - # save a schema to schema store + """Saves schema to schema store and bumps the version""" schema_file = self._file_name_in_store(schema.name, "json") stored_schema = schema.to_dict() saved_path = self.storage.save(schema_file, to_pretty_json(stored_schema)) @@ -184,16 +218,45 @@ def _save_schema(self, schema: Schema) -> str: schema._bump_version() return saved_path + def _save_and_export_schema(self, schema: Schema, check_processing_hints: bool = False) -> str: + """Saves schema to schema store and then exports it. If the export path is the same as import + path, processing hints will be removed. + """ + saved_path = self._save_schema(schema) + if self.config.export_schema_path: + self._export_schema( + schema, + self.config.export_schema_path, + self.config.export_schema_path == self.config.import_schema_path, + ) + # if any processing hints are found we should warn the user + if check_processing_hints and (processing_hints := get_processing_hints(schema.tables)): + msg = ( + f"Imported schema {schema.name} contains processing hints for some tables." + " Processing hints are used by normalizer (x-normalizer) to mark tables that got" + " materialized and that prevents destructive changes to the schema. In most cases" + " import schema should not contain processing hints because it is mostly used to" + " initialize tables in a new dataset. " + ) + msg += "Affected tables are: " + ", ".join(processing_hints.keys()) + logger.warning(msg) + return saved_path + @staticmethod def load_schema_file( - path: str, name: str, extensions: Tuple[TSchemaFileFormat, ...] = SchemaFileExtensions + path: str, + name: str, + extensions: Tuple[TSchemaFileFormat, ...] 
= SchemaFileExtensions, + remove_processing_hints: bool = False, ) -> Schema: storage = FileStorage(path) for extension in extensions: file = SchemaStorage._file_name_in_store(name, extension) if storage.has_file(file): parsed_schema = SchemaStorage._parse_schema_str(storage.load(file), extension) - schema = Schema.from_dict(parsed_schema) + schema = Schema.from_dict( + parsed_schema, remove_processing_hints=remove_processing_hints + ) if schema.name != name: raise UnexpectedSchemaName(name, path, schema.name) return schema diff --git a/dlt/common/typing.py b/dlt/common/typing.py index 29c1b01d80..fdd27161f7 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -4,7 +4,7 @@ import os from re import Pattern as _REPattern import sys -from types import FunctionType, MethodType, ModuleType +from types import FunctionType from typing import ( ForwardRef, Callable, @@ -39,6 +39,7 @@ Concatenate, get_args, get_origin, + get_original_bases, ) try: @@ -105,6 +106,8 @@ VARIANT_FIELD_FORMAT = "v_%s" TFileOrPath = Union[str, PathLike, IO[Any]] TSortOrder = Literal["asc", "desc"] +TLoaderFileFormat = Literal["jsonl", "typed-jsonl", "insert_values", "parquet", "csv"] +"""known loader file formats""" class ConfigValueSentinel(NamedTuple): @@ -257,6 +260,25 @@ def is_literal_type(hint: Type[Any]) -> bool: return False +def get_literal_args(literal: Type[Any]) -> List[Any]: + """Recursively get arguments from nested Literal types and return an unified list.""" + if not hasattr(literal, "__origin__") or literal.__origin__ is not Literal: + raise ValueError("Provided type is not a Literal") + + unified_args = [] + + def _get_args(literal: Type[Any]) -> None: + for arg in get_args(literal): + if hasattr(arg, "__origin__") and arg.__origin__ is Literal: + _get_args(arg) + else: + unified_args.append(arg) + + _get_args(literal) + + return unified_args + + def is_newtype_type(t: Type[Any]) -> bool: if hasattr(t, "__supertype__"): return True @@ -362,7 +384,7 @@ def is_subclass(subclass: Any, cls: Any) -> bool: def get_generic_type_argument_from_instance( - instance: Any, sample_value: Optional[Any] + instance: Any, sample_value: Optional[Any] = None ) -> Type[Any]: """Infers type argument of a Generic class from an `instance` of that class using optional `sample_value` of the argument type @@ -376,8 +398,14 @@ def get_generic_type_argument_from_instance( Type[Any]: type argument or Any if not known """ orig_param_type = Any - if hasattr(instance, "__orig_class__"): - orig_param_type = get_args(instance.__orig_class__)[0] + if cls_ := getattr(instance, "__orig_class__", None): + # instance of generic class + pass + elif bases_ := get_original_bases(instance.__class__): + # instance of class deriving from generic + cls_ = bases_[0] + if cls_: + orig_param_type = get_args(cls_)[0] if orig_param_type is Any and sample_value is not None: orig_param_type = type(sample_value) return orig_param_type # type: ignore diff --git a/dlt/common/utils.py b/dlt/common/utils.py index cb2ec4c3d9..8e89556c39 100644 --- a/dlt/common/utils.py +++ b/dlt/common/utils.py @@ -13,6 +13,7 @@ from typing import ( Any, + Callable, ContextManager, Dict, MutableMapping, @@ -141,42 +142,6 @@ def flatten_list_of_str_or_dicts(seq: Sequence[Union[StrAny, str]]) -> DictStrAn return o -# def flatten_dicts_of_dicts(dicts: Mapping[str, Any]) -> Sequence[Any]: -# """ -# Transform and object {K: {...}, L: {...}...} -> [{key:K, ....}, {key: L, ...}, ...] 
-# """ -# o: List[Any] = [] -# for k, v in dicts.items(): -# if isinstance(v, list): -# # if v is a list then add "key" to each list element -# for lv in v: -# lv["key"] = k -# else: -# # add as "key" to dict -# v["key"] = k - -# o.append(v) -# return o - - -# def tuplify_list_of_dicts(dicts: Sequence[DictStrAny]) -> Sequence[DictStrAny]: -# """ -# Transform list of dictionaries with single key into single dictionary of {"key": orig_key, "value": orig_value} -# """ -# for d in dicts: -# if len(d) > 1: -# raise ValueError(f"Tuplify requires one key dicts {d}") -# if len(d) == 1: -# key = next(iter(d)) -# # delete key first to avoid name clashes -# value = d[key] -# del d[key] -# d["key"] = key -# d["value"] = value - -# return dicts - - def flatten_list_or_items(_iter: Union[Iterable[TAny], Iterable[List[TAny]]]) -> Iterator[TAny]: for items in _iter: if isinstance(items, List): @@ -503,11 +468,15 @@ def merge_row_counts(row_counts_1: RowCounts, row_counts_2: RowCounts) -> None: row_counts_1[counter_name] = row_counts_1.get(counter_name, 0) + row_counts_2[counter_name] -def extend_list_deduplicated(original_list: List[Any], extending_list: Iterable[Any]) -> List[Any]: +def extend_list_deduplicated( + original_list: List[Any], + extending_list: Iterable[Any], + normalize_f: Callable[[str], str] = str.__call__, +) -> List[Any]: """extends the first list by the second, but does not add duplicates""" - list_keys = set(original_list) + list_keys = set(normalize_f(s) for s in original_list) for item in extending_list: - if item not in list_keys: + if normalize_f(item) not in list_keys: original_list.append(item) return original_list diff --git a/dlt/common/validation.py b/dlt/common/validation.py index 0a8bced287..8862c10024 100644 --- a/dlt/common/validation.py +++ b/dlt/common/validation.py @@ -7,6 +7,7 @@ from dlt.common.exceptions import DictValidationException from dlt.common.typing import ( StrAny, + get_literal_args, get_type_name, is_callable_type, is_literal_type, @@ -114,7 +115,7 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: failed_validations, ) elif is_literal_type(t): - a_l = get_args(t) + a_l = get_literal_args(t) if pv not in a_l: raise DictValidationException( f"field '{pk}' with value {pv} is not one of: {a_l}", path, t, pk, pv diff --git a/dlt/destinations/adapters.py b/dlt/destinations/adapters.py index 1c3e094e19..42d4879653 100644 --- a/dlt/destinations/adapters.py +++ b/dlt/destinations/adapters.py @@ -1,11 +1,11 @@ """This module collects all destination adapters present in `impl` namespace""" -from dlt.destinations.impl.weaviate import weaviate_adapter -from dlt.destinations.impl.qdrant import qdrant_adapter -from dlt.destinations.impl.bigquery import bigquery_adapter -from dlt.destinations.impl.synapse import synapse_adapter -from dlt.destinations.impl.clickhouse import clickhouse_adapter -from dlt.destinations.impl.athena import athena_adapter +from dlt.destinations.impl.weaviate.weaviate_adapter import weaviate_adapter +from dlt.destinations.impl.qdrant.qdrant_adapter import qdrant_adapter +from dlt.destinations.impl.bigquery.bigquery_adapter import bigquery_adapter +from dlt.destinations.impl.synapse.synapse_adapter import synapse_adapter +from dlt.destinations.impl.clickhouse.clickhouse_adapter import clickhouse_adapter +from dlt.destinations.impl.athena.athena_adapter import athena_adapter, athena_partition __all__ = [ "weaviate_adapter", @@ -14,4 +14,5 @@ "synapse_adapter", "clickhouse_adapter", "athena_adapter", + "athena_partition", ] diff --git 
a/dlt/destinations/fs_client.py b/dlt/destinations/fs_client.py index 5153659614..3233446594 100644 --- a/dlt/destinations/fs_client.py +++ b/dlt/destinations/fs_client.py @@ -1,3 +1,4 @@ +import gzip from typing import Iterable, cast, Any, List from abc import ABC, abstractmethod from fsspec import AbstractFileSystem @@ -38,10 +39,19 @@ def read_bytes(self, path: str, start: Any = None, end: Any = None, **kwargs: An def read_text( self, path: str, - encoding: Any = None, + encoding: Any = "utf-8", errors: Any = None, newline: Any = None, + compression: str = None, **kwargs: Any ) -> str: - """reads given file into string""" - return cast(str, self.fs_client.read_text(path, encoding, errors, newline, **kwargs)) + """reads given file into string, tries gzip and pure text""" + if compression is None: + try: + return self.read_text(path, encoding, errors, newline, "gzip", **kwargs) + except (gzip.BadGzipFile, OSError): + pass + with self.fs_client.open( + path, mode="rt", compression=compression, encoding=encoding, newline=newline + ) as f: + return cast(str, f.read()) diff --git a/dlt/destinations/impl/athena/__init__.py b/dlt/destinations/impl/athena/__init__.py index 87a11f9f41..e69de29bb2 100644 --- a/dlt/destinations/impl/athena/__init__.py +++ b/dlt/destinations/impl/athena/__init__.py @@ -1,33 +0,0 @@ -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.data_writers.escape import ( - escape_athena_identifier, - format_bigquery_datetime_literal, -) -from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - # athena only supports loading from staged files on s3 for now - caps.preferred_loader_file_format = None - caps.supported_loader_file_formats = [] - caps.supported_table_formats = ["iceberg"] - caps.preferred_staging_file_format = "parquet" - caps.supported_staging_file_formats = ["parquet", "jsonl"] - caps.escape_identifier = escape_athena_identifier - caps.format_datetime_literal = format_bigquery_datetime_literal - caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) - caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) - caps.max_identifier_length = 255 - caps.max_column_identifier_length = 255 - caps.max_query_length = 16 * 1024 * 1024 - caps.is_max_query_length_in_bytes = True - caps.max_text_data_type_length = 262144 - caps.is_max_text_data_type_length_in_bytes = True - caps.supports_ddl_transactions = False - caps.supports_transactions = False - caps.alter_add_multi_column = True - caps.schema_supports_numeric_precision = False - caps.timestamp_precision = 3 - caps.supports_truncate_command = False - return caps diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 60ea64a4e7..8d0ffb1d0c 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -34,21 +34,18 @@ from dlt.common import logger from dlt.common.exceptions import TerminalValueError from dlt.common.utils import without_none -from dlt.common.data_types import TDataType -from dlt.common.schema import TColumnSchema, Schema, TSchemaTables, TTableSchema +from dlt.common.schema import TColumnSchema, Schema, TTableSchema from dlt.common.schema.typing import ( TTableSchema, TColumnType, - TWriteDisposition, TTableFormat, TSortOrder, ) -from dlt.common.schema.utils import table_schema_has_type, get_table_format +from dlt.common.schema.utils import 
table_schema_has_type from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import LoadJob, DoNothingFollowupJob, DoNothingJob -from dlt.common.destination.reference import TLoadJobState, NewLoadJob, SupportsStagingDestination -from dlt.common.storages import FileStorage -from dlt.common.data_writers.escape import escape_bigquery_identifier +from dlt.common.destination.reference import NewLoadJob, SupportsStagingDestination +from dlt.common.data_writers.escape import escape_hive_identifier from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlMergeJob from dlt.destinations.typing import DBApi, DBTransaction @@ -58,7 +55,6 @@ DatabaseUndefinedRelation, LoadJobTerminalException, ) -from dlt.destinations.impl.athena import capabilities from dlt.destinations.sql_client import ( SqlClientBase, DBApiCursorImpl, @@ -221,11 +217,15 @@ def requires_temp_table_for_delete(cls) -> bool: class AthenaSQLClient(SqlClientBase[Connection]): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() dbapi: ClassVar[DBApi] = pyathena - def __init__(self, dataset_name: str, config: AthenaClientConfiguration) -> None: - super().__init__(None, dataset_name) + def __init__( + self, + dataset_name: str, + config: AthenaClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(None, dataset_name, capabilities) self._conn: Connection = None self.config = config self.credentials = config.credentials @@ -254,8 +254,9 @@ def escape_ddl_identifier(self, v: str) -> str: # Athena uses HIVE to create tables but for querying it uses PRESTO (so normal escaping) if not v: return v + v = self.capabilities.casefold_identifier(v) # bigquery uses hive escaping - return escape_bigquery_identifier(v) + return escape_hive_identifier(v) def fully_qualified_ddl_dataset_name(self) -> str: return self.escape_ddl_identifier(self.dataset_name) @@ -271,11 +272,6 @@ def create_dataset(self) -> None: def drop_dataset(self) -> None: self.execute_sql(f"DROP DATABASE {self.fully_qualified_ddl_dataset_name()} CASCADE;") - def fully_qualified_dataset_name(self, escape: bool = True) -> str: - return ( - self.capabilities.escape_identifier(self.dataset_name) if escape else self.dataset_name - ) - def drop_tables(self, *tables: str) -> None: if not tables: return @@ -366,17 +362,14 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB yield DBApiCursorImpl(cursor) # type: ignore - def has_dataset(self) -> bool: - # PRESTO escaping for queries - query = f"""SHOW DATABASES LIKE {self.fully_qualified_dataset_name()};""" - rows = self.execute_sql(query) - return len(rows) > 0 - class AthenaClient(SqlJobClientWithStaging, SupportsStagingDestination): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - - def __init__(self, schema: Schema, config: AthenaClientConfiguration) -> None: + def __init__( + self, + schema: Schema, + config: AthenaClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: # verify if staging layout is valid for Athena # this will raise if the table prefix is not properly defined # we actually that {table_name} is first, no {schema_name} is allowed @@ -386,7 +379,7 @@ def __init__(self, schema: Schema, config: AthenaClientConfiguration) -> None: table_needs_own_folder=True, ) - sql_client = AthenaSQLClient(config.normalize_dataset_name(schema), config) + sql_client = AthenaSQLClient(config.normalize_dataset_name(schema), config, capabilities) 
super().__init__(schema, config, sql_client) self.sql_client: AthenaSQLClient = sql_client # type: ignore self.config: AthenaClientConfiguration = config diff --git a/dlt/destinations/impl/athena/factory.py b/dlt/destinations/impl/athena/factory.py index 5b37607cca..d4c29a641f 100644 --- a/dlt/destinations/impl/athena/factory.py +++ b/dlt/destinations/impl/athena/factory.py @@ -1,9 +1,14 @@ import typing as t from dlt.common.destination import Destination, DestinationCapabilitiesContext -from dlt.destinations.impl.athena.configuration import AthenaClientConfiguration from dlt.common.configuration.specs import AwsCredentials -from dlt.destinations.impl.athena import capabilities +from dlt.common.data_writers.escape import ( + escape_athena_identifier, + format_bigquery_datetime_literal, +) +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE + +from dlt.destinations.impl.athena.configuration import AthenaClientConfiguration if t.TYPE_CHECKING: from dlt.destinations.impl.athena.athena import AthenaClient @@ -12,8 +17,36 @@ class athena(Destination[AthenaClientConfiguration, "AthenaClient"]): spec = AthenaClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities() + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + # athena only supports loading from staged files on s3 for now + caps.preferred_loader_file_format = None + caps.supported_loader_file_formats = [] + caps.supported_table_formats = ["iceberg"] + caps.preferred_staging_file_format = "parquet" + caps.supported_staging_file_formats = ["parquet", "jsonl"] + # athena is storing all identifiers in lower case and is case insensitive + # it also uses lower case in all the queries + # https://docs.aws.amazon.com/athena/latest/ug/tables-databases-columns-names.html + caps.escape_identifier = escape_athena_identifier + caps.casefold_identifier = str.lower + caps.has_case_sensitive_identifiers = False + caps.format_datetime_literal = format_bigquery_datetime_literal + caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) + caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) + caps.max_identifier_length = 255 + caps.max_column_identifier_length = 255 + caps.max_query_length = 16 * 1024 * 1024 + caps.is_max_query_length_in_bytes = True + caps.max_text_data_type_length = 262144 + caps.is_max_text_data_type_length_in_bytes = True + caps.supports_ddl_transactions = False + caps.supports_transactions = False + caps.alter_add_multi_column = True + caps.schema_supports_numeric_precision = False + caps.timestamp_precision = 3 + caps.supports_truncate_command = False + return caps @property def client_class(self) -> t.Type["AthenaClient"]: diff --git a/dlt/destinations/impl/bigquery/__init__.py b/dlt/destinations/impl/bigquery/__init__.py index 39322b43a0..e69de29bb2 100644 --- a/dlt/destinations/impl/bigquery/__init__.py +++ b/dlt/destinations/impl/bigquery/__init__.py @@ -1,31 +0,0 @@ -from dlt.common.data_writers.escape import ( - escape_bigquery_identifier, - format_bigquery_datetime_literal, -) -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = "jsonl" - caps.supported_loader_file_formats = ["jsonl", "parquet"] - caps.preferred_staging_file_format = "parquet" - 
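A minimal sketch of how the two identifier-handling fields set in the Athena `_raw_capabilities()` above (`casefold_identifier` and `escape_identifier`) combine: casefold to the storage casing first, then escape. The stand-in functions and the helper name are illustrative only, not part of this patch.

```py
# Illustrative only: simplified stand-ins for the capability fields set above.
casefold_identifier = str.lower            # Athena stores identifiers lower-cased
escape_identifier = lambda v: f'"{v}"'     # real escaping lives in dlt.common.data_writers.escape

def render_identifier(name: str, escape: bool = True) -> str:
    name = casefold_identifier(name)       # apply storage casing first
    return escape_identifier(name) if escape else name

assert render_identifier("MyTable") == '"mytable"'
```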
caps.supported_staging_file_formats = ["parquet", "jsonl"] - # BQ limit is 4GB but leave a large headroom since buffered writer does not preemptively check size - caps.recommended_file_size = int(1024 * 1024 * 1024) - caps.escape_identifier = escape_bigquery_identifier - caps.escape_literal = None - caps.format_datetime_literal = format_bigquery_datetime_literal - caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) - caps.wei_precision = (76, 38) - caps.max_identifier_length = 1024 - caps.max_column_identifier_length = 300 - caps.max_query_length = 1024 * 1024 - caps.is_max_query_length_in_bytes = False - caps.max_text_data_type_length = 10 * 1024 * 1024 - caps.is_max_text_data_type_length_in_bytes = True - caps.supports_ddl_transactions = False - caps.supports_clone_table = True - - return caps diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index f26e6f42ee..c3a1be4174 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -1,7 +1,7 @@ import functools import os from pathlib import Path -from typing import Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Type, cast +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, cast import google.cloud.bigquery as bigquery # noqa: I250 from google.api_core import exceptions as api_core_exceptions @@ -35,7 +35,6 @@ LoadJobNotExistsException, LoadJobTerminalException, ) -from dlt.destinations.impl.bigquery import capabilities from dlt.destinations.impl.bigquery.bigquery_adapter import ( PARTITION_HINT, CLUSTER_HINT, @@ -50,6 +49,7 @@ from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations.sql_jobs import SqlMergeJob from dlt.destinations.type_mapping import TypeMapper +from dlt.destinations.utils import parse_db_data_type_str_with_precision from dlt.pipeline.current import destination_state @@ -58,10 +58,10 @@ class BigQueryTypeMapper(TypeMapper): "complex": "JSON", "text": "STRING", "double": "FLOAT64", - "bool": "BOOLEAN", + "bool": "BOOL", "date": "DATE", "timestamp": "TIMESTAMP", - "bigint": "INTEGER", + "bigint": "INT64", "binary": "BYTES", "wei": "BIGNUMERIC", # non-parametrized should hold wei values "time": "TIME", @@ -74,11 +74,11 @@ class BigQueryTypeMapper(TypeMapper): dbt_to_sct = { "STRING": "text", - "FLOAT": "double", - "BOOLEAN": "bool", + "FLOAT64": "double", + "BOOL": "bool", "DATE": "date", "TIMESTAMP": "timestamp", - "INTEGER": "bigint", + "INT64": "bigint", "BYTES": "binary", "NUMERIC": "decimal", "BIGNUMERIC": "decimal", @@ -97,9 +97,10 @@ def to_db_decimal_type(self, precision: Optional[int], scale: Optional[int]) -> def from_db_type( self, db_type: str, precision: Optional[int], scale: Optional[int] ) -> TColumnType: - if db_type == "BIGNUMERIC" and precision is None: + # precision is present in the type name + if db_type == "BIGNUMERIC": return dict(data_type="wei") - return super().from_db_type(db_type, precision, scale) + return super().from_db_type(*parse_db_data_type_str_with_precision(db_type)) class BigQueryLoadJob(LoadJob, FollowupJob): @@ -173,12 +174,16 @@ def gen_key_table_clauses( class BigQueryClient(SqlJobClientWithStaging, SupportsStagingDestination): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - - def __init__(self, schema: Schema, config: BigQueryClientConfiguration) -> None: + def __init__( + self, + schema: Schema, + config: BigQueryClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) 
-> None: sql_client = BigQuerySqlClient( config.normalize_dataset_name(schema), config.credentials, + capabilities, config.get_location(), config.http_timeout, config.retry_deadline, @@ -266,7 +271,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> reason = BigQuerySqlClient._get_reason_from_errors(gace) if reason == "notFound": # google.api_core.exceptions.NotFound: 404 – table not found - raise UnknownTableException(table["name"]) from gace + raise UnknownTableException(self.schema.name, table["name"]) from gace elif ( reason == "duplicate" ): # google.api_core.exceptions.Conflict: 409 PUT – already exists @@ -292,15 +297,15 @@ def _get_table_update_sql( c for c in new_columns if c.get("partition") or c.get(PARTITION_HINT, False) ]: if len(partition_list) > 1: - col_names = [self.capabilities.escape_identifier(c["name"]) for c in partition_list] + col_names = [self.sql_client.escape_column_name(c["name"]) for c in partition_list] raise DestinationSchemaWillNotUpdate( canonical_name, col_names, "Partition requested for more than one column" ) elif (c := partition_list[0])["data_type"] == "date": - sql[0] += f"\nPARTITION BY {self.capabilities.escape_identifier(c['name'])}" + sql[0] += f"\nPARTITION BY {self.sql_client.escape_column_name(c['name'])}" elif (c := partition_list[0])["data_type"] == "timestamp": sql[0] = ( - f"{sql[0]}\nPARTITION BY DATE({self.capabilities.escape_identifier(c['name'])})" + f"{sql[0]}\nPARTITION BY DATE({self.sql_client.escape_column_name(c['name'])})" ) # Automatic partitioning of an INT64 type requires us to be prescriptive - we treat the column as a UNIX timestamp. # This is due to the bounds requirement of GENERATE_ARRAY function for partitioning. @@ -309,12 +314,12 @@ def _get_table_update_sql( # See: https://dlthub.com/devel/dlt-ecosystem/destinations/bigquery#supported-column-hints elif (c := partition_list[0])["data_type"] == "bigint": sql[0] += ( - f"\nPARTITION BY RANGE_BUCKET({self.capabilities.escape_identifier(c['name'])}," + f"\nPARTITION BY RANGE_BUCKET({self.sql_client.escape_column_name(c['name'])}," " GENERATE_ARRAY(-172800000, 691200000, 86400))" ) if cluster_list := [ - self.capabilities.escape_identifier(c["name"]) + self.sql_client.escape_column_name(c["name"]) for c in new_columns if c.get("cluster") or c.get(CLUSTER_HINT, False) ]: @@ -365,8 +370,57 @@ def prepare_load_table( ) return table + def get_storage_tables( + self, table_names: Iterable[str] + ) -> Iterable[Tuple[str, TTableSchemaColumns]]: + """Gets table schemas from BigQuery using INFORMATION_SCHEMA or get_table for hidden datasets""" + if not self.sql_client.is_hidden_dataset: + return super().get_storage_tables(table_names) + + # use the api to get storage tables for hidden dataset + schema_tables: List[Tuple[str, TTableSchemaColumns]] = [] + for table_name in table_names: + try: + schema_table: TTableSchemaColumns = {} + table = self.sql_client.native_connection.get_table( + self.sql_client.make_qualified_table_name(table_name, escape=False), + retry=self.sql_client._default_retry, + timeout=self.config.http_timeout, + ) + for c in table.schema: + schema_c: TColumnSchema = { + "name": c.name, + "nullable": c.is_nullable, + **self._from_db_type(c.field_type, c.precision, c.scale), + } + schema_table[c.name] = schema_c + schema_tables.append((table_name, schema_table)) + except gcp_exceptions.NotFound: + # table is not present + schema_tables.append((table_name, {})) + return schema_tables + + def _get_info_schema_columns_query( + self, 
catalog_name: Optional[str], schema_name: str, folded_table_names: List[str] + ) -> Tuple[str, List[Any]]: + """Bigquery needs to scope the INFORMATION_SCHEMA.COLUMNS with project and dataset name so standard query generator cannot be used.""" + # escape schema and catalog names + catalog_name = self.capabilities.escape_identifier(catalog_name) + schema_name = self.capabilities.escape_identifier(schema_name) + + query = f""" +SELECT {",".join(self._get_storage_table_query_columns())} + FROM {catalog_name}.{schema_name}.INFORMATION_SCHEMA.COLUMNS +WHERE """ + + # placeholder for each table + table_placeholders = ",".join(["%s"] * len(folded_table_names)) + query += f"table_name IN ({table_placeholders}) ORDER BY table_name, ordinal_position;" + + return query, folded_table_names + def _get_column_def_sql(self, column: TColumnSchema, table_format: TTableFormat = None) -> str: - name = self.capabilities.escape_identifier(column["name"]) + name = self.sql_client.escape_column_name(column["name"]) column_def_sql = ( f"{name} {self.type_mapper.to_db_type(column, table_format)} {self._gen_not_null(column.get('nullable', True))}" ) @@ -376,32 +430,6 @@ def _get_column_def_sql(self, column: TColumnSchema, table_format: TTableFormat column_def_sql += " OPTIONS (rounding_mode='ROUND_HALF_AWAY_FROM_ZERO')" return column_def_sql - def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: - schema_table: TTableSchemaColumns = {} - try: - table = self.sql_client.native_connection.get_table( - self.sql_client.make_qualified_table_name(table_name, escape=False), - retry=self.sql_client._default_retry, - timeout=self.config.http_timeout, - ) - partition_field = table.time_partitioning.field if table.time_partitioning else None - for c in table.schema: - schema_c: TColumnSchema = { - "name": c.name, - "nullable": c.is_nullable, - "unique": False, - "sort": False, - "primary_key": False, - "foreign_key": False, - "cluster": c.name in (table.clustering_fields or []), - "partition": c.name == partition_field, - **self._from_db_type(c.field_type, c.precision, c.scale), - } - schema_table[c.name] = schema_c - return True, schema_table - except gcp_exceptions.NotFound: - return False, schema_table - def _create_load_job(self, table: TTableSchema, file_path: str) -> bigquery.LoadJob: # append to table for merge loads (append to stage) and regular appends. 
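A sketch of how the per-table placeholders for the scoped `INFORMATION_SCHEMA.COLUMNS` query above are assembled: one `%s` parameter per (already casefolded) table name. Project, dataset and column names below are placeholders.

```py
# Illustrative only: mirrors the placeholder construction in
# _get_info_schema_columns_query above; table names are bound as parameters.
folded_table_names = ["events", "users"]   # example tables, already casefolded
columns = ["table_name", "column_name", "data_type", "is_nullable"]  # example column list

query = f"""
SELECT {",".join(columns)}
  FROM `my-project`.`my_dataset`.INFORMATION_SCHEMA.COLUMNS
WHERE """
table_placeholders = ",".join(["%s"] * len(folded_table_names))
query += f"table_name IN ({table_placeholders}) ORDER BY table_name, ordinal_position;"
# parameters passed alongside the query: ("events", "users")
```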
table_name = table["name"] diff --git a/dlt/destinations/impl/bigquery/configuration.py b/dlt/destinations/impl/bigquery/configuration.py index f69e85ca3d..0e2403f7d9 100644 --- a/dlt/destinations/impl/bigquery/configuration.py +++ b/dlt/destinations/impl/bigquery/configuration.py @@ -14,6 +14,7 @@ class BigQueryClientConfiguration(DestinationClientDwhWithStagingConfiguration): destination_type: Final[str] = dataclasses.field(default="bigquery", init=False, repr=False, compare=False) # type: ignore credentials: GcpServiceAccountCredentials = None location: str = "US" + has_case_sensitive_identifiers: bool = True http_timeout: float = 15.0 # connection timeout for http request to BigQuery api file_upload_timeout: float = 30 * 60.0 # a timeout for file upload when loading local files diff --git a/dlt/destinations/impl/bigquery/factory.py b/dlt/destinations/impl/bigquery/factory.py index bee55fa164..db61a6042a 100644 --- a/dlt/destinations/impl/bigquery/factory.py +++ b/dlt/destinations/impl/bigquery/factory.py @@ -1,10 +1,13 @@ import typing as t -from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration +from dlt.common.normalizers.naming import NamingConvention from dlt.common.configuration.specs import GcpServiceAccountCredentials -from dlt.destinations.impl.bigquery import capabilities +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE +from dlt.common.data_writers.escape import escape_hive_identifier, format_bigquery_datetime_literal from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration + if t.TYPE_CHECKING: from dlt.destinations.impl.bigquery.bigquery import BigQueryClient @@ -13,8 +16,34 @@ class bigquery(Destination[BigQueryClientConfiguration, "BigQueryClient"]): spec = BigQueryClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities() + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "jsonl" + caps.supported_loader_file_formats = ["jsonl", "parquet"] + caps.preferred_staging_file_format = "parquet" + caps.supported_staging_file_formats = ["parquet", "jsonl"] + # BigQuery is by default case sensitive but that cannot be turned off for a dataset + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity + caps.escape_identifier = escape_hive_identifier + caps.escape_literal = None + caps.has_case_sensitive_identifiers = True + caps.casefold_identifier = str + # BQ limit is 4GB but leave a large headroom since buffered writer does not preemptively check size + caps.recommended_file_size = int(1024 * 1024 * 1024) + caps.format_datetime_literal = format_bigquery_datetime_literal + caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) + caps.wei_precision = (76, 38) + caps.max_identifier_length = 1024 + caps.max_column_identifier_length = 300 + caps.max_query_length = 1024 * 1024 + caps.is_max_query_length_in_bytes = False + caps.max_text_data_type_length = 10 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = True + caps.supports_ddl_transactions = False + caps.supports_clone_table = True + caps.schema_supports_numeric_precision = False # no precision information in BigQuery + + return caps @property def client_class(self) -> t.Type["BigQueryClient"]: @@ -26,14 +55,38 @@ def __init__( self, credentials: 
t.Optional[GcpServiceAccountCredentials] = None, location: t.Optional[str] = None, + has_case_sensitive_identifiers: bool = None, destination_name: t.Optional[str] = None, environment: t.Optional[str] = None, **kwargs: t.Any, ) -> None: + """Configure the MsSql destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the mssql database. Can be an instance of `GcpServiceAccountCredentials` or + a dict or string with service accounts credentials as used in the Google Cloud + location: A location where the datasets will be created, eg. "EU". The default is "US" + has_case_sensitive_identifiers: Is the dataset case-sensitive, defaults to True + **kwargs: Additional arguments passed to the destination config + """ super().__init__( credentials=credentials, location=location, + has_case_sensitive_identifiers=has_case_sensitive_identifiers, destination_name=destination_name, environment=environment, **kwargs, ) + + @classmethod + def adjust_capabilities( + cls, + caps: DestinationCapabilitiesContext, + config: BigQueryClientConfiguration, + naming: t.Optional[NamingConvention], + ) -> DestinationCapabilitiesContext: + # modify the caps if case sensitive identifiers are requested + caps.has_case_sensitive_identifiers = config.has_case_sensitive_identifiers + return super().adjust_capabilities(caps, config, naming) diff --git a/dlt/destinations/impl/bigquery/sql_client.py b/dlt/destinations/impl/bigquery/sql_client.py index 21086a4db6..45e9379af5 100644 --- a/dlt/destinations/impl/bigquery/sql_client.py +++ b/dlt/destinations/impl/bigquery/sql_client.py @@ -1,5 +1,5 @@ from contextlib import contextmanager -from typing import Any, AnyStr, ClassVar, Iterator, List, Optional, Sequence +from typing import Any, AnyStr, ClassVar, Iterator, List, Optional, Sequence, Generator import google.cloud.bigquery as bigquery # noqa: I250 from google.api_core import exceptions as api_core_exceptions @@ -8,6 +8,7 @@ from google.cloud.bigquery.dbapi import Connection as DbApiConnection, Cursor as BQDbApiCursor from google.cloud.bigquery.dbapi import exceptions as dbapi_exceptions +from dlt.common import logger from dlt.common.configuration.specs import GcpServiceAccountCredentialsWithoutDefaults from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.typing import StrAny @@ -16,7 +17,6 @@ DatabaseTransientException, DatabaseUndefinedRelation, ) -from dlt.destinations.impl.bigquery import capabilities from dlt.destinations.sql_client import ( DBApiCursorImpl, SqlClientBase, @@ -44,29 +44,42 @@ class BigQueryDBApiCursorImpl(DBApiCursorImpl): """Use native BigQuery data frame support if available""" native_cursor: BQDbApiCursor # type: ignore + df_iterator: Generator[Any, None, None] - def df(self, chunk_size: int = None, **kwargs: Any) -> DataFrame: - if chunk_size is not None: - return super().df(chunk_size=chunk_size) + def __init__(self, curr: DBApiCursor) -> None: + super().__init__(curr) + self.df_iterator = None + + def df(self, chunk_size: Optional[int] = None, **kwargs: Any) -> DataFrame: query_job: bigquery.QueryJob = getattr( self.native_cursor, "_query_job", self.native_cursor.query_job ) - + if self.df_iterator: + return next(self.df_iterator, None) try: + if chunk_size is not None: + # create iterator with given page size + self.df_iterator = query_job.result(page_size=chunk_size).to_dataframe_iterable() + return 
next(self.df_iterator, None) return query_job.to_dataframe(**kwargs) - except ValueError: + except ValueError as ex: # no pyarrow/db-types, fallback to our implementation - return super().df() + logger.warning(f"Native BigQuery pandas reader could not be used: {str(ex)}") + return super().df(chunk_size=chunk_size) + + def close(self) -> None: + if self.df_iterator: + self.df_iterator.close() class BigQuerySqlClient(SqlClientBase[bigquery.Client], DBTransaction): dbapi: ClassVar[DBApi] = bq_dbapi - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() def __init__( self, dataset_name: str, credentials: GcpServiceAccountCredentialsWithoutDefaults, + capabilities: DestinationCapabilitiesContext, location: str = "US", http_timeout: float = 15.0, retry_deadline: float = 60.0, @@ -75,7 +88,7 @@ def __init__( self.credentials: GcpServiceAccountCredentialsWithoutDefaults = credentials self.location = location self.http_timeout = http_timeout - super().__init__(credentials.project_id, dataset_name) + super().__init__(credentials.project_id, dataset_name, capabilities) self._default_retry = bigquery.DEFAULT_RETRY.with_deadline(retry_deadline) self._default_query = bigquery.QueryJobConfig( @@ -177,8 +190,11 @@ def has_dataset(self) -> bool: return False def create_dataset(self) -> None: + dataset = bigquery.Dataset(self.fully_qualified_dataset_name(escape=False)) + dataset.location = self.location + dataset.is_case_insensitive = not self.capabilities.has_case_sensitive_identifiers self._client.create_dataset( - self.fully_qualified_dataset_name(escape=False), + dataset, retry=self._default_retry, timeout=self.http_timeout, ) @@ -221,14 +237,19 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB # will close all cursors conn.close() - def fully_qualified_dataset_name(self, escape: bool = True) -> str: + def catalog_name(self, escape: bool = True) -> Optional[str]: + project_id = self.capabilities.casefold_identifier(self.credentials.project_id) if escape: - project_id = self.capabilities.escape_identifier(self.credentials.project_id) - dataset_name = self.capabilities.escape_identifier(self.dataset_name) - else: - project_id = self.credentials.project_id - dataset_name = self.dataset_name - return f"{project_id}.{dataset_name}" + project_id = self.capabilities.escape_identifier(project_id) + return project_id + + @property + def is_hidden_dataset(self) -> bool: + """Tells if the dataset associated with sql_client is a hidden dataset. + + Hidden datasets are not present in information schema. 
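A hedged usage sketch of what the rewritten `create_dataset()` above boils down to in google-cloud-bigquery terms; project, dataset and location values are placeholders, and retry/timeout handling is omitted.

```py
import google.cloud.bigquery as bigquery

# Illustrative only: the dataset object carries the configured location and the
# inverse of has_case_sensitive_identifiers, as in create_dataset() above.
client = bigquery.Client()                           # assumes default credentials
dataset = bigquery.Dataset("my-project.my_dataset")  # placeholder ids
dataset.location = "EU"
dataset.is_case_insensitive = False                  # has_case_sensitive_identifiers=True
client.create_dataset(dataset)
```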
+ """ + return self.dataset_name.startswith("_") @classmethod def _make_database_exception(cls, ex: Exception) -> Exception: diff --git a/dlt/destinations/impl/clickhouse/__init__.py b/dlt/destinations/impl/clickhouse/__init__.py index bead136828..e69de29bb2 100644 --- a/dlt/destinations/impl/clickhouse/__init__.py +++ b/dlt/destinations/impl/clickhouse/__init__.py @@ -1,53 +0,0 @@ -import sys - -from dlt.common.pendulum import pendulum -from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.common.data_writers.escape import ( - escape_clickhouse_identifier, - escape_clickhouse_literal, - format_clickhouse_datetime_literal, -) -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.destinations.impl.clickhouse.clickhouse_adapter import clickhouse_adapter - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = "jsonl" - caps.supported_loader_file_formats = ["parquet", "jsonl"] - caps.preferred_staging_file_format = "jsonl" - caps.supported_staging_file_formats = ["parquet", "jsonl"] - - caps.format_datetime_literal = format_clickhouse_datetime_literal - caps.escape_identifier = escape_clickhouse_identifier - caps.escape_literal = escape_clickhouse_literal - - # https://stackoverflow.com/questions/68358686/what-is-the-maximum-length-of-a-column-in-clickhouse-can-it-be-modified - caps.max_identifier_length = 255 - caps.max_column_identifier_length = 255 - - # ClickHouse has no max `String` type length. - caps.max_text_data_type_length = sys.maxsize - - caps.schema_supports_numeric_precision = True - # Use 'Decimal128' with these defaults. - # https://clickhouse.com/docs/en/sql-reference/data-types/decimal - caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) - # Use 'Decimal256' with these defaults. - caps.wei_precision = (76, 0) - caps.timestamp_precision = 6 - - # https://clickhouse.com/docs/en/operations/settings/settings#max_query_size - caps.is_max_query_length_in_bytes = True - caps.max_query_length = 262144 - - # ClickHouse has limited support for transactional semantics, especially for `ReplicatedMergeTree`, - # the default ClickHouse Cloud engine. It does, however, provide atomicity for individual DDL operations like `ALTER TABLE`. 
- # https://clickhouse-driver.readthedocs.io/en/latest/dbapi.html#clickhouse_driver.dbapi.connection.Connection.commit - # https://clickhouse.com/docs/en/guides/developer/transactional#transactions-commit-and-rollback - caps.supports_transactions = False - caps.supports_ddl_transactions = False - - caps.supports_truncate_command = True - - return caps diff --git a/dlt/destinations/impl/clickhouse/clickhouse.py b/dlt/destinations/impl/clickhouse/clickhouse.py index cf1f1bc857..6dd8fd47ed 100644 --- a/dlt/destinations/impl/clickhouse/clickhouse.py +++ b/dlt/destinations/impl/clickhouse/clickhouse.py @@ -36,7 +36,6 @@ ) from dlt.common.storages import FileStorage from dlt.destinations.exceptions import LoadJobTerminalException -from dlt.destinations.impl.clickhouse import capabilities from dlt.destinations.impl.clickhouse.clickhouse_adapter import ( TTableEngineType, TABLE_ENGINE_TYPE_HINT, @@ -289,15 +288,14 @@ def requires_temp_table_for_delete(cls) -> bool: class ClickHouseClient(SqlJobClientWithStaging, SupportsStagingDestination): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - def __init__( self, schema: Schema, config: ClickHouseClientConfiguration, + capabilities: DestinationCapabilitiesContext, ) -> None: self.sql_client: ClickHouseSqlClient = ClickHouseSqlClient( - config.normalize_dataset_name(schema), config.credentials + config.normalize_dataset_name(schema), config.credentials, capabilities ) super().__init__(schema, config, self.sql_client) self.config: ClickHouseClientConfiguration = config @@ -327,7 +325,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non ) return ( - f"{self.capabilities.escape_identifier(c['name'])} {type_with_nullability_modifier} {hints_str}" + f"{self.sql_client.escape_column_name(c['name'])} {type_with_nullability_modifier} {hints_str}" .strip() ) @@ -357,7 +355,7 @@ def _get_table_update_sql( sql[0] = f"{sql[0]}\nENGINE = {TABLE_ENGINE_TYPE_TO_CLICKHOUSE_ATTR.get(table_type)}" if primary_key_list := [ - self.capabilities.escape_identifier(c["name"]) + self.sql_client.escape_column_name(c["name"]) for c in new_columns if c.get("primary_key") ]: @@ -367,34 +365,6 @@ def _get_table_update_sql( return sql - def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: - fields = self._get_storage_table_query_columns() - db_params = self.sql_client.make_qualified_table_name(table_name, escape=False).split( - ".", 3 - ) - query = f'SELECT {",".join(fields)} FROM INFORMATION_SCHEMA.COLUMNS WHERE ' - if len(db_params) == 3: - query += "table_catalog = %s AND " - query += "table_schema = %s AND table_name = %s ORDER BY ordinal_position;" - rows = self.sql_client.execute_sql(query, *db_params) - - # If no rows we assume that table does not exist. - schema_table: TTableSchemaColumns = {} - if len(rows) == 0: - return False, schema_table - for c in rows: - numeric_precision = ( - c[3] if self.capabilities.schema_supports_numeric_precision else None - ) - numeric_scale = c[4] if self.capabilities.schema_supports_numeric_precision else None - schema_c: TColumnSchemaBase = { - "name": c[0], - "nullable": bool(c[2]), - **self._from_db_type(c[1], numeric_precision, numeric_scale), - } - schema_table[c[0]] = schema_c # type: ignore - return True, schema_table - @staticmethod def _gen_not_null(v: bool) -> str: # ClickHouse fields are not nullable by default. 
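A small sketch of the nullability handling referenced above: ClickHouse columns are non-nullable unless wrapped in `Nullable(...)`, which is what the nullability modifier applied in `_get_column_def_sql` amounts to. The helper name is hypothetical.

```py
# Illustrative only: nullable ClickHouse columns must be wrapped explicitly.
def with_nullability(base_type: str, nullable: bool) -> str:
    return f"Nullable({base_type})" if nullable else base_type

assert with_nullability("String", True) == "Nullable(String)"
assert with_nullability("DateTime64(6)", False) == "DateTime64(6)"
```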
diff --git a/dlt/destinations/impl/clickhouse/factory.py b/dlt/destinations/impl/clickhouse/factory.py index e5b8fc0e6a..52a1694dee 100644 --- a/dlt/destinations/impl/clickhouse/factory.py +++ b/dlt/destinations/impl/clickhouse/factory.py @@ -1,7 +1,14 @@ +import sys import typing as t from dlt.common.destination import Destination, DestinationCapabilitiesContext -from dlt.destinations.impl.clickhouse import capabilities +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE +from dlt.common.data_writers.escape import ( + escape_clickhouse_identifier, + escape_clickhouse_literal, + format_clickhouse_datetime_literal, +) + from dlt.destinations.impl.clickhouse.configuration import ( ClickHouseClientConfiguration, ClickHouseCredentials, @@ -16,8 +23,51 @@ class clickhouse(Destination[ClickHouseClientConfiguration, "ClickHouseClient"]): spec = ClickHouseClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities() + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "jsonl" + caps.supported_loader_file_formats = ["parquet", "jsonl"] + caps.preferred_staging_file_format = "jsonl" + caps.supported_staging_file_formats = ["parquet", "jsonl"] + + caps.format_datetime_literal = format_clickhouse_datetime_literal + caps.escape_identifier = escape_clickhouse_identifier + caps.escape_literal = escape_clickhouse_literal + # docs are very unclear https://clickhouse.com/docs/en/sql-reference/syntax + # taking into account other sources: identifiers are case sensitive + caps.has_case_sensitive_identifiers = True + # and store as is in the information schema + caps.casefold_identifier = str + + # https://stackoverflow.com/questions/68358686/what-is-the-maximum-length-of-a-column-in-clickhouse-can-it-be-modified + caps.max_identifier_length = 255 + caps.max_column_identifier_length = 255 + + # ClickHouse has no max `String` type length. + caps.max_text_data_type_length = sys.maxsize + + caps.schema_supports_numeric_precision = True + # Use 'Decimal128' with these defaults. + # https://clickhouse.com/docs/en/sql-reference/data-types/decimal + caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) + # Use 'Decimal256' with these defaults. + caps.wei_precision = (76, 0) + caps.timestamp_precision = 6 + + # https://clickhouse.com/docs/en/operations/settings/settings#max_query_size + caps.is_max_query_length_in_bytes = True + caps.max_query_length = 262144 + + # ClickHouse has limited support for transactional semantics, especially for `ReplicatedMergeTree`, + # the default ClickHouse Cloud engine. It does, however, provide atomicity for individual DDL operations like `ALTER TABLE`. 
+ # https://clickhouse-driver.readthedocs.io/en/latest/dbapi.html#clickhouse_driver.dbapi.connection.Connection.commit + # https://clickhouse.com/docs/en/guides/developer/transactional#transactions-commit-and-rollback + caps.supports_transactions = False + caps.supports_ddl_transactions = False + + caps.supports_truncate_command = True + + return caps @property def client_class(self) -> t.Type["ClickHouseClient"]: diff --git a/dlt/destinations/impl/clickhouse/sql_client.py b/dlt/destinations/impl/clickhouse/sql_client.py index 8fb89c90cd..ee013ea123 100644 --- a/dlt/destinations/impl/clickhouse/sql_client.py +++ b/dlt/destinations/impl/clickhouse/sql_client.py @@ -7,6 +7,7 @@ Optional, Sequence, ClassVar, + Tuple, ) import clickhouse_driver # type: ignore[import-untyped] @@ -20,7 +21,6 @@ DatabaseTransientException, DatabaseTerminalException, ) -from dlt.destinations.impl.clickhouse import capabilities from dlt.destinations.impl.clickhouse.configuration import ClickHouseCredentials from dlt.destinations.sql_client import ( DBApiCursorImpl, @@ -45,15 +45,20 @@ class ClickHouseSqlClient( SqlClientBase[clickhouse_driver.dbapi.connection.Connection], DBTransaction ): dbapi: ClassVar[DBApi] = clickhouse_driver.dbapi - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - def __init__(self, dataset_name: str, credentials: ClickHouseCredentials) -> None: - super().__init__(credentials.database, dataset_name) + def __init__( + self, + dataset_name: str, + credentials: ClickHouseCredentials, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(credentials.database, dataset_name, capabilities) self._conn: clickhouse_driver.dbapi.connection = None self.credentials = credentials self.database_name = credentials.database def has_dataset(self) -> bool: + # we do not need to normalize dataset_sentinel_table_name sentinel_table = self.credentials.dataset_sentinel_table_name return sentinel_table in [ t.split(self.credentials.dataset_table_separator)[1] for t in self._list_tables() @@ -110,10 +115,11 @@ def drop_dataset(self) -> None: # This is because the driver incorrectly substitutes the entire query string, causing the "DROP TABLE" keyword to be omitted. # To resolve this, we are forced to provide the full query string here. 
self.execute_sql( - f"""DROP TABLE {self.capabilities.escape_identifier(self.database_name)}.{self.capabilities.escape_identifier(table)} SYNC""" + f"""DROP TABLE {self.catalog_name()}.{self.capabilities.escape_identifier(table)} SYNC""" ) def _list_tables(self) -> List[str]: + catalog_name, table_name = self.make_qualified_table_name_path("%", escape=False) rows = self.execute_sql( """ SELECT name @@ -121,10 +127,8 @@ def _list_tables(self) -> List[str]: WHERE database = %s AND name LIKE %s """, - ( - self.database_name, - f"{self.dataset_name}{self.credentials.dataset_table_separator}%", - ), + catalog_name, + table_name, ) return [row[0] for row in rows] @@ -151,21 +155,33 @@ def execute_query( yield ClickHouseDBApiCursorImpl(cursor) # type: ignore[abstract] - def fully_qualified_dataset_name(self, escape: bool = True) -> str: - database_name = self.database_name - dataset_name = self.dataset_name - if escape: - database_name = self.capabilities.escape_identifier(database_name) - dataset_name = self.capabilities.escape_identifier(dataset_name) - return f"{database_name}.{dataset_name}" - - def make_qualified_table_name(self, table_name: str, escape: bool = True) -> str: - database_name = self.database_name - table_name = f"{self.dataset_name}{self.credentials.dataset_table_separator}{table_name}" + def catalog_name(self, escape: bool = True) -> Optional[str]: + database_name = self.capabilities.casefold_identifier(self.database_name) if escape: database_name = self.capabilities.escape_identifier(database_name) - table_name = self.capabilities.escape_identifier(table_name) - return f"{database_name}.{table_name}" + return database_name + + def make_qualified_table_name_path( + self, table_name: Optional[str], escape: bool = True + ) -> List[str]: + # get catalog and dataset + path = super().make_qualified_table_name_path(None, escape=escape) + if table_name: + # table name combines dataset name and table name + table_name = self.capabilities.casefold_identifier( + f"{self.dataset_name}{self.credentials.dataset_table_separator}{table_name}" + ) + if escape: + table_name = self.capabilities.escape_identifier(table_name) + # we have only two path components + path[1] = table_name + return path + + def _get_information_schema_components(self, *tables: str) -> Tuple[str, str, List[str]]: + components = super()._get_information_schema_components(*tables) + # clickhouse has a catalogue and no schema but uses catalogue as a schema to query the information schema 🤷 + # so we must disable catalogue search. 
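A sketch of the naming scheme behind `make_qualified_table_name_path` above: a ClickHouse "dataset" is emulated as a table-name prefix inside a single database, so the qualified name has exactly two path components. The separator value shown is an assumption based on `dataset_table_separator`.

```py
# Illustrative only: placeholder names; separator assumed from the credentials config.
database, dataset, table = "dlt_db", "my_dataset", "events"
separator = "___"
qualified = f"`{database}`.`{dataset}{separator}{table}`"
# -> `dlt_db`.`my_dataset___events`
```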
also note that table name is prefixed with logical "dataset_name" + return (None, components[0], components[2]) @classmethod def _make_database_exception(cls, ex: Exception) -> Exception: diff --git a/dlt/destinations/impl/databricks/__init__.py b/dlt/destinations/impl/databricks/__init__.py index 81884fae4b..e69de29bb2 100644 --- a/dlt/destinations/impl/databricks/__init__.py +++ b/dlt/destinations/impl/databricks/__init__.py @@ -1,30 +0,0 @@ -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.data_writers.escape import escape_databricks_identifier, escape_databricks_literal -from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE - -from dlt.destinations.impl.databricks.configuration import DatabricksClientConfiguration - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = None - caps.supported_loader_file_formats = [] - caps.preferred_staging_file_format = "parquet" - caps.supported_staging_file_formats = ["jsonl", "parquet"] - caps.escape_identifier = escape_databricks_identifier - caps.escape_literal = escape_databricks_literal - caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) - caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) - caps.max_identifier_length = 255 - caps.max_column_identifier_length = 255 - caps.max_query_length = 2 * 1024 * 1024 - caps.is_max_query_length_in_bytes = True - caps.max_text_data_type_length = 16 * 1024 * 1024 - caps.is_max_text_data_type_length_in_bytes = True - caps.supports_ddl_transactions = False - caps.supports_truncate_command = True - # caps.supports_transactions = False - caps.alter_add_multi_column = True - caps.supports_multiple_statements = False - caps.supports_clone_table = True - return caps diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py index cd203e7e4d..62debdedb7 100644 --- a/dlt/destinations/impl/databricks/databricks.py +++ b/dlt/destinations/impl/databricks/databricks.py @@ -1,6 +1,7 @@ from typing import ClassVar, Dict, Optional, Sequence, Tuple, List, Any, Iterable, Type, cast from urllib.parse import urlparse, urlunparse +from dlt import config from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( FollowupJob, @@ -15,27 +16,22 @@ AzureCredentials, AzureCredentialsWithoutDefaults, ) -from dlt.common.data_types import TDataType from dlt.common.exceptions import TerminalValueError from dlt.common.storages.file_storage import FileStorage from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns from dlt.common.schema.typing import TTableSchema, TColumnType, TSchemaTables, TTableFormat from dlt.common.schema.utils import table_schema_has_type +from dlt.common.storages import FilesystemConfiguration, fsspec_from_config from dlt.destinations.insert_job_client import InsertValuesJobClient from dlt.destinations.job_impl import EmptyLoadJob from dlt.destinations.exceptions import LoadJobTerminalException - -from dlt.destinations.impl.databricks import capabilities from dlt.destinations.impl.databricks.configuration import DatabricksClientConfiguration from dlt.destinations.impl.databricks.sql_client import DatabricksSqlClient -from dlt.destinations.sql_jobs import SqlMergeJob, SqlJobParams +from dlt.destinations.sql_jobs import SqlMergeJob from dlt.destinations.job_impl import NewReferenceJob -from dlt.destinations.sql_client import 
SqlClientBase from dlt.destinations.type_mapping import TypeMapper -from dlt.common.storages import FilesystemConfiguration, fsspec_from_config -from dlt import config class DatabricksTypeMapper(TypeMapper): @@ -258,10 +254,15 @@ def gen_delete_from_sql( class DatabricksClient(InsertValuesJobClient, SupportsStagingDestination): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - - def __init__(self, schema: Schema, config: DatabricksClientConfiguration) -> None: - sql_client = DatabricksSqlClient(config.normalize_dataset_name(schema), config.credentials) + def __init__( + self, + schema: Schema, + config: DatabricksClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + sql_client = DatabricksSqlClient( + config.normalize_dataset_name(schema), config.credentials, capabilities + ) super().__init__(schema, config, sql_client) self.config: DatabricksClientConfiguration = config self.sql_client: DatabricksSqlClient = sql_client # type: ignore[assignment] @@ -303,7 +304,7 @@ def _get_table_update_sql( sql = super()._get_table_update_sql(table_name, new_columns, generate_alter) cluster_list = [ - self.capabilities.escape_identifier(c["name"]) for c in new_columns if c.get("cluster") + self.sql_client.escape_column_name(c["name"]) for c in new_columns if c.get("cluster") ] if cluster_list: @@ -317,14 +318,14 @@ def _from_db_type( return self.type_mapper.from_db_type(bq_t, precision, scale) def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: - name = self.capabilities.escape_identifier(c["name"]) + name = self.sql_client.escape_column_name(c["name"]) return ( f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}" ) def _get_storage_table_query_columns(self) -> List[str]: fields = super()._get_storage_table_query_columns() - fields[1] = ( # Override because this is the only way to get data type with precision + fields[2] = ( # Override because this is the only way to get data type with precision "full_data_type" ) return fields diff --git a/dlt/destinations/impl/databricks/factory.py b/dlt/destinations/impl/databricks/factory.py index 7c6c95137d..56462714c1 100644 --- a/dlt/destinations/impl/databricks/factory.py +++ b/dlt/destinations/impl/databricks/factory.py @@ -1,12 +1,13 @@ import typing as t from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.data_writers.escape import escape_databricks_identifier, escape_databricks_literal +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE from dlt.destinations.impl.databricks.configuration import ( DatabricksCredentials, DatabricksClientConfiguration, ) -from dlt.destinations.impl.databricks import capabilities if t.TYPE_CHECKING: from dlt.destinations.impl.databricks.databricks import DatabricksClient @@ -15,8 +16,33 @@ class databricks(Destination[DatabricksClientConfiguration, "DatabricksClient"]): spec = DatabricksClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities() + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = None + caps.supported_loader_file_formats = [] + caps.preferred_staging_file_format = "parquet" + caps.supported_staging_file_formats = ["jsonl", "parquet"] + caps.escape_identifier = escape_databricks_identifier + # databricks identifiers are case insensitive and stored in lower case + # 
https://docs.databricks.com/en/sql/language-manual/sql-ref-identifiers.html + caps.escape_literal = escape_databricks_literal + caps.casefold_identifier = str.lower + caps.has_case_sensitive_identifiers = False + caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) + caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) + caps.max_identifier_length = 255 + caps.max_column_identifier_length = 255 + caps.max_query_length = 2 * 1024 * 1024 + caps.is_max_query_length_in_bytes = True + caps.max_text_data_type_length = 16 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = True + caps.supports_ddl_transactions = False + caps.supports_truncate_command = True + # caps.supports_transactions = False + caps.alter_add_multi_column = True + caps.supports_multiple_statements = False + caps.supports_clone_table = True + return caps @property def client_class(self) -> t.Type["DatabricksClient"]: diff --git a/dlt/destinations/impl/databricks/sql_client.py b/dlt/destinations/impl/databricks/sql_client.py index 530b03715a..da91402803 100644 --- a/dlt/destinations/impl/databricks/sql_client.py +++ b/dlt/destinations/impl/databricks/sql_client.py @@ -1,5 +1,5 @@ from contextlib import contextmanager, suppress -from typing import Any, AnyStr, ClassVar, Iterator, Optional, Sequence, List, Union, Dict +from typing import Any, AnyStr, ClassVar, Iterator, Optional, Sequence, List, Tuple, Union, Dict from databricks import sql as databricks_lib @@ -21,18 +21,37 @@ raise_database_error, raise_open_connection_error, ) -from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction +from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction, DataFrame from dlt.destinations.impl.databricks.configuration import DatabricksCredentials -from dlt.destinations.impl.databricks import capabilities -from dlt.common.time import to_py_date, to_py_datetime + + +class DatabricksCursorImpl(DBApiCursorImpl): + """Use native data frame support if available""" + + native_cursor: DatabricksSqlCursor # type: ignore[assignment] + vector_size: ClassVar[int] = 2048 + + def df(self, chunk_size: int = None, **kwargs: Any) -> DataFrame: + if chunk_size is None: + return self.native_cursor.fetchall_arrow().to_pandas() + else: + df = self.native_cursor.fetchmany_arrow(chunk_size).to_pandas() + if df.shape[0] == 0: + return None + else: + return df class DatabricksSqlClient(SqlClientBase[DatabricksSqlConnection], DBTransaction): dbapi: ClassVar[DBApi] = databricks_lib - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - def __init__(self, dataset_name: str, credentials: DatabricksCredentials) -> None: - super().__init__(credentials.catalog, dataset_name) + def __init__( + self, + dataset_name: str, + credentials: DatabricksCredentials, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(credentials.catalog, dataset_name, capabilities) self._conn: DatabricksSqlConnection = None self.credentials = credentials @@ -112,16 +131,13 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB db_args = args or kwargs or None with self._conn.cursor() as curr: # type: ignore[assignment] curr.execute(query, db_args) - yield DBApiCursorImpl(curr) # type: ignore[abstract] + yield DatabricksCursorImpl(curr) # type: ignore[abstract] - def fully_qualified_dataset_name(self, escape: bool = True) -> str: + def catalog_name(self, escape: bool = True) -> Optional[str]: + catalog = self.capabilities.casefold_identifier(self.credentials.catalog) if 
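A brief usage sketch for the chunked interface added in `DatabricksCursorImpl` above: repeated `df(chunk_size=...)` calls yield successive pandas frames and `None` once the arrow fetch comes back empty. `cursor` and `process` are placeholders for an executed cursor and user code.

```py
# Illustrative only: drain a cursor in fixed-size pandas chunks.
def drain(cursor, process, chunk_size: int = 10_000) -> None:
    while (frame := cursor.df(chunk_size=chunk_size)) is not None:
        process(frame)
```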
escape: - catalog = self.capabilities.escape_identifier(self.credentials.catalog) - dataset_name = self.capabilities.escape_identifier(self.dataset_name) - else: - catalog = self.credentials.catalog - dataset_name = self.dataset_name - return f"{catalog}.{dataset_name}" + catalog = self.capabilities.escape_identifier(catalog) + return catalog @staticmethod def _make_database_exception(ex: Exception) -> Exception: diff --git a/dlt/destinations/impl/destination/__init__.py b/dlt/destinations/impl/destination/__init__.py index 5b076df4c6..e69de29bb2 100644 --- a/dlt/destinations/impl/destination/__init__.py +++ b/dlt/destinations/impl/destination/__init__.py @@ -1,21 +0,0 @@ -from typing import Optional -from dlt.common.destination import DestinationCapabilitiesContext, TLoaderFileFormat -from dlt.common.destination.capabilities import TLoaderParallelismStrategy - - -def capabilities( - preferred_loader_file_format: TLoaderFileFormat = "typed-jsonl", - naming_convention: str = "direct", - max_table_nesting: Optional[int] = 0, - max_parallel_load_jobs: Optional[int] = 0, - loader_parallelism_strategy: Optional[TLoaderParallelismStrategy] = None, -) -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext.generic_capabilities(preferred_loader_file_format) - caps.supported_loader_file_formats = ["typed-jsonl", "parquet"] - caps.supports_ddl_transactions = False - caps.supports_transactions = False - caps.naming_convention = naming_convention - caps.max_table_nesting = max_table_nesting - caps.max_parallel_load_jobs = max_parallel_load_jobs - caps.loader_parallelism_strategy = loader_parallelism_strategy - return caps diff --git a/dlt/destinations/impl/destination/configuration.py b/dlt/destinations/impl/destination/configuration.py index c3b677058c..705f3b0bb5 100644 --- a/dlt/destinations/impl/destination/configuration.py +++ b/dlt/destinations/impl/destination/configuration.py @@ -1,20 +1,23 @@ import dataclasses -from typing import Optional, Final, Callable, Union +from typing import Optional, Final, Callable, Union, Any from typing_extensions import ParamSpec -from dlt.common.configuration import configspec +from dlt.common.configuration import configspec, ConfigurationValueError from dlt.common.destination import TLoaderFileFormat from dlt.common.destination.reference import ( DestinationClientConfiguration, ) from dlt.common.typing import TDataItems from dlt.common.schema import TTableSchema -from dlt.common.destination import Destination TDestinationCallable = Callable[[Union[TDataItems, str], TTableSchema], None] TDestinationCallableParams = ParamSpec("TDestinationCallableParams") +def dummy_custom_destination(*args: Any, **kwargs: Any) -> None: + pass + + @configspec class CustomDestinationClientConfiguration(DestinationClientConfiguration): destination_type: Final[str] = dataclasses.field(default="destination", init=False, repr=False, compare=False) # type: ignore @@ -23,3 +26,15 @@ class CustomDestinationClientConfiguration(DestinationClientConfiguration): batch_size: int = 10 skip_dlt_columns_and_tables: bool = True max_table_nesting: Optional[int] = 0 + + def ensure_callable(self) -> None: + """Makes sure that valid callable was provided""" + # TODO: this surely can be done with `on_resolved` + if ( + self.destination_callable is None + or self.destination_callable is dummy_custom_destination + ): + raise ConfigurationValueError( + f"A valid callable was not provided to {self.__class__.__name__}. Did you decorate" + " a function @dlt.destination correctly?" 
+ ) diff --git a/dlt/destinations/impl/destination/destination.py b/dlt/destinations/impl/destination/destination.py index 69d1d1d98a..c44fd3cca1 100644 --- a/dlt/destinations/impl/destination/destination.py +++ b/dlt/destinations/impl/destination/destination.py @@ -15,8 +15,6 @@ DoNothingJob, JobClientBase, ) - -from dlt.destinations.impl.destination import capabilities from dlt.destinations.impl.destination.configuration import CustomDestinationClientConfiguration from dlt.destinations.job_impl import ( DestinationJsonlLoadJob, @@ -27,10 +25,14 @@ class DestinationClient(JobClientBase): """Sink Client""" - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - - def __init__(self, schema: Schema, config: CustomDestinationClientConfiguration) -> None: - super().__init__(schema, config) + def __init__( + self, + schema: Schema, + config: CustomDestinationClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + config.ensure_callable() + super().__init__(schema, config, capabilities) self.config: CustomDestinationClientConfiguration = config # create pre-resolved callable to avoid multiple config resolutions during execution of the jobs self.destination_callable = create_resolved_partial( diff --git a/dlt/destinations/impl/destination/factory.py b/dlt/destinations/impl/destination/factory.py index b3127ab99b..69bb0daa13 100644 --- a/dlt/destinations/impl/destination/factory.py +++ b/dlt/destinations/impl/destination/factory.py @@ -4,18 +4,20 @@ from types import ModuleType from dlt.common import logger +from dlt.common.destination.capabilities import TLoaderParallelismStrategy +from dlt.common.exceptions import TerminalValueError +from dlt.common.normalizers.naming.naming import NamingConvention from dlt.common.typing import AnyFun from dlt.common.destination import Destination, DestinationCapabilitiesContext, TLoaderFileFormat from dlt.common.configuration import known_sections, with_config, get_fun_spec from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.common.utils import get_callable_name, is_inner_callable -from dlt.destinations.exceptions import DestinationTransientException from dlt.destinations.impl.destination.configuration import ( CustomDestinationClientConfiguration, + dummy_custom_destination, TDestinationCallable, ) -from dlt.destinations.impl.destination import capabilities if t.TYPE_CHECKING: from dlt.destinations.impl.destination.destination import DestinationClient @@ -34,16 +36,16 @@ class DestinationInfo(t.NamedTuple): class destination(Destination[CustomDestinationClientConfiguration, "DestinationClient"]): - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities( - preferred_loader_file_format=self.config_params.get( - "loader_file_format", "typed-jsonl" - ), - naming_convention=self.config_params.get("naming_convention", "direct"), - max_table_nesting=self.config_params.get("max_table_nesting", None), - max_parallel_load_jobs=self.config_params.get("max_parallel_load_jobs", None), - loader_parallelism_strategy=self.config_params.get("loader_parallelism_strategy", None), - ) + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext.generic_capabilities("typed-jsonl") + caps.supported_loader_file_formats = ["typed-jsonl", "parquet"] + caps.supports_ddl_transactions = False + caps.supports_transactions = False + caps.naming_convention = "direct" + caps.max_table_nesting = 0 + caps.max_parallel_load_jobs = 0 + 
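A hedged sketch of the kind of callable `ensure_callable()` above guards for: a function decorated with `@dlt.destination`. Without one, the placeholder `dummy_custom_destination` is injected and configuration resolution raises. The decorator arguments shown are examples based on the config spec in this patch.

```py
import dlt

# Illustrative only: a minimal custom destination callable.
@dlt.destination(batch_size=10)
def my_sink(items, table) -> None:
    # items is a batch of rows, table is the dlt table schema for that batch
    print(f"received {len(items)} rows for table {table['name']}")
```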
caps.loader_parallelism_strategy = None + return caps @property def spec(self) -> t.Type[CustomDestinationClientConfiguration]: @@ -68,7 +70,7 @@ def __init__( **kwargs: t.Any, ) -> None: if spec and not issubclass(spec, CustomDestinationClientConfiguration): - raise ValueError( + raise TerminalValueError( "A SPEC for a sink destination must use CustomDestinationClientConfiguration as a" " base." ) @@ -97,14 +99,7 @@ def __init__( "No destination callable provided, providing dummy callable which will fail on" " load." ) - - def dummy_callable(*args: t.Any, **kwargs: t.Any) -> None: - raise DestinationTransientException( - "You tried to load to a custom destination without a valid callable." - ) - - destination_callable = dummy_callable - + destination_callable = dummy_custom_destination elif not callable(destination_callable): raise ConfigurationValueError("Resolved Sink destination callable is not a callable.") @@ -138,9 +133,21 @@ def dummy_callable(*args: t.Any, **kwargs: t.Any) -> None: super().__init__( destination_name=destination_name, environment=environment, + # NOTE: `loader_file_format` is not a field in the caps so we had to hack the base class to allow this loader_file_format=loader_file_format, batch_size=batch_size, naming_convention=naming_convention, destination_callable=conf_callable, **kwargs, ) + + @classmethod + def adjust_capabilities( + cls, + caps: DestinationCapabilitiesContext, + config: CustomDestinationClientConfiguration, + naming: t.Optional[NamingConvention], + ) -> DestinationCapabilitiesContext: + caps = super().adjust_capabilities(caps, config, naming) + caps.preferred_loader_file_format = config.loader_file_format + return caps diff --git a/dlt/destinations/impl/dremio/__init__.py b/dlt/destinations/impl/dremio/__init__.py index b4bde2fe6d..96d4748f1d 100644 --- a/dlt/destinations/impl/dremio/__init__.py +++ b/dlt/destinations/impl/dremio/__init__.py @@ -10,6 +10,9 @@ def capabilities() -> DestinationCapabilitiesContext: caps.preferred_staging_file_format = "parquet" caps.supported_staging_file_formats = ["jsonl", "parquet"] caps.escape_identifier = escape_dremio_identifier + # all identifiers are case insensitive but are stored as is + # https://docs.dremio.com/current/sonar/data-sources + caps.has_case_sensitive_identifiers = False caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) caps.max_identifier_length = 255 diff --git a/dlt/destinations/impl/dremio/dremio.py b/dlt/destinations/impl/dremio/dremio.py index 23bca0ad74..00e51b74a6 100644 --- a/dlt/destinations/impl/dremio/dremio.py +++ b/dlt/destinations/impl/dremio/dremio.py @@ -14,7 +14,6 @@ from dlt.common.storages.file_storage import FileStorage from dlt.common.utils import uniq_id from dlt.destinations.exceptions import LoadJobTerminalException -from dlt.destinations.impl.dremio import capabilities from dlt.destinations.impl.dremio.configuration import DremioClientConfiguration from dlt.destinations.impl.dremio.sql_client import DremioSqlClient from dlt.destinations.job_client_impl import SqlJobClientWithStaging @@ -137,10 +136,15 @@ def exception(self) -> str: class DremioClient(SqlJobClientWithStaging, SupportsStagingDestination): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - - def __init__(self, schema: Schema, config: DremioClientConfiguration) -> None: - sql_client = DremioSqlClient(config.normalize_dataset_name(schema), config.credentials) + def __init__( + self, + schema: Schema, + 
config: DremioClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + sql_client = DremioSqlClient( + config.normalize_dataset_name(schema), config.credentials, capabilities + ) super().__init__(schema, config, sql_client) self.config: DremioClientConfiguration = config self.sql_client: DremioSqlClient = sql_client # type: ignore @@ -172,7 +176,7 @@ def _get_table_update_sql( if not generate_alter: partition_list = [ - self.capabilities.escape_identifier(c["name"]) + self.sql_client.escape_column_name(c["name"]) for c in new_columns if c.get("partition") ] @@ -180,7 +184,7 @@ def _get_table_update_sql( sql[0] += "\nPARTITION BY (" + ",".join(partition_list) + ")" sort_list = [ - self.capabilities.escape_identifier(c["name"]) for c in new_columns if c.get("sort") + self.sql_client.escape_column_name(c["name"]) for c in new_columns if c.get("sort") ] if sort_list: sql[0] += "\nLOCALSORT BY (" + ",".join(sort_list) + ")" @@ -193,45 +197,11 @@ def _from_db_type( return self.type_mapper.from_db_type(bq_t, precision, scale) def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: - name = self.capabilities.escape_identifier(c["name"]) + name = self.sql_client.escape_column_name(c["name"]) return ( f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}" ) - def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: - def _null_to_bool(v: str) -> bool: - if v == "NO": - return False - elif v == "YES": - return True - raise ValueError(v) - - fields = self._get_storage_table_query_columns() - table_schema = self.sql_client.fully_qualified_dataset_name(escape=False) - db_params = (table_schema, table_name) - query = f""" -SELECT {",".join(fields)} - FROM INFORMATION_SCHEMA.COLUMNS -WHERE - table_catalog = 'DREMIO' AND table_schema = %s AND table_name = %s ORDER BY ordinal_position; -""" - rows = self.sql_client.execute_sql(query, *db_params) - - # if no rows we assume that table does not exist - schema_table: TTableSchemaColumns = {} - if len(rows) == 0: - return False, schema_table - for c in rows: - numeric_precision = c[3] - numeric_scale = c[4] - schema_c: TColumnSchemaBase = { - "name": c[0], - "nullable": _null_to_bool(c[2]), - **self._from_db_type(c[1], numeric_precision, numeric_scale), - } - schema_table[c[0]] = schema_c # type: ignore - return True, schema_table - def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: return [DremioMergeJob.from_table_chain(table_chain, self.sql_client)] diff --git a/dlt/destinations/impl/dremio/factory.py b/dlt/destinations/impl/dremio/factory.py index 61895e4f90..29a4937c69 100644 --- a/dlt/destinations/impl/dremio/factory.py +++ b/dlt/destinations/impl/dremio/factory.py @@ -1,11 +1,13 @@ import typing as t +from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE +from dlt.common.data_writers.escape import escape_dremio_identifier + from dlt.destinations.impl.dremio.configuration import ( DremioCredentials, DremioClientConfiguration, ) -from dlt.destinations.impl.dremio import capabilities -from dlt.common.destination import Destination, DestinationCapabilitiesContext if t.TYPE_CHECKING: from dlt.destinations.impl.dremio.dremio import DremioClient @@ -14,8 +16,31 @@ class dremio(Destination[DremioClientConfiguration, "DremioClient"]): spec = DremioClientConfiguration - def capabilities(self) 
-> DestinationCapabilitiesContext: - return capabilities() + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = None + caps.supported_loader_file_formats = [] + caps.preferred_staging_file_format = "parquet" + caps.supported_staging_file_formats = ["jsonl", "parquet"] + caps.escape_identifier = escape_dremio_identifier + # all identifiers are case insensitive but are stored as is + # https://docs.dremio.com/current/sonar/data-sources + caps.has_case_sensitive_identifiers = False + caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) + caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) + caps.max_identifier_length = 255 + caps.max_column_identifier_length = 255 + caps.max_query_length = 2 * 1024 * 1024 + caps.is_max_query_length_in_bytes = True + caps.max_text_data_type_length = 16 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = True + caps.supports_transactions = False + caps.supports_ddl_transactions = False + caps.alter_add_multi_column = True + caps.supports_clone_table = False + caps.supports_multiple_statements = False + caps.timestamp_precision = 3 + return caps @property def client_class(self) -> t.Type["DremioClient"]: diff --git a/dlt/destinations/impl/dremio/sql_client.py b/dlt/destinations/impl/dremio/sql_client.py index 255c8acee0..fac65e7fd0 100644 --- a/dlt/destinations/impl/dremio/sql_client.py +++ b/dlt/destinations/impl/dremio/sql_client.py @@ -1,5 +1,5 @@ from contextlib import contextmanager, suppress -from typing import Any, AnyStr, ClassVar, Iterator, Optional, Sequence, List +from typing import Any, AnyStr, ClassVar, Iterator, Optional, Sequence, List, Tuple import pyarrow @@ -10,7 +10,7 @@ DatabaseUndefinedRelation, DatabaseTransientException, ) -from dlt.destinations.impl.dremio import capabilities, pydremio +from dlt.destinations.impl.dremio import pydremio from dlt.destinations.impl.dremio.configuration import DremioCredentials from dlt.destinations.sql_client import ( DBApiCursorImpl, @@ -32,10 +32,14 @@ def df(self, chunk_size: int = None, **kwargs: Any) -> Optional[DataFrame]: class DremioSqlClient(SqlClientBase[pydremio.DremioConnection]): dbapi: ClassVar[DBApi] = pydremio - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - def __init__(self, dataset_name: str, credentials: DremioCredentials) -> None: - super().__init__(credentials.database, dataset_name) + def __init__( + self, + dataset_name: str, + credentials: DremioCredentials, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(credentials.database, dataset_name, capabilities) self._conn: Optional[pydremio.DremioConnection] = None self.credentials = credentials @@ -99,18 +103,16 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB raise DatabaseTransientException(ex) yield DremioCursorImpl(curr) # type: ignore - def fully_qualified_dataset_name(self, escape: bool = True) -> str: - database_name = self.credentials.database - dataset_name = self.dataset_name + def catalog_name(self, escape: bool = True) -> Optional[str]: + database_name = self.capabilities.casefold_identifier(self.database_name) if escape: database_name = self.capabilities.escape_identifier(database_name) - dataset_name = self.capabilities.escape_identifier(dataset_name) - return f"{database_name}.{dataset_name}" + return database_name - def make_qualified_table_name(self, table_name: str, escape: bool = True) -> str: - if escape: 
- table_name = self.capabilities.escape_identifier(table_name) - return f"{self.fully_qualified_dataset_name(escape=escape)}.{table_name}" + def _get_information_schema_components(self, *tables: str) -> Tuple[str, str, List[str]]: + components = super()._get_information_schema_components(*tables) + # catalog is always DREMIO but schema contains "database" prefix 🤷 + return ("DREMIO", self.fully_qualified_dataset_name(escape=False), components[2]) @classmethod def _make_database_exception(cls, ex: Exception) -> Exception: @@ -138,10 +140,10 @@ def _get_table_names(self) -> List[str]: query = """ SELECT TABLE_NAME FROM INFORMATION_SCHEMA."TABLES" - WHERE TABLE_CATALOG = 'DREMIO' AND TABLE_SCHEMA = %s + WHERE TABLE_CATALOG = %s AND TABLE_SCHEMA = %s """ - db_params = [self.fully_qualified_dataset_name(escape=False)] - tables = self.execute_sql(query, *db_params) or [] + catalog_name, schema_name, _ = self._get_information_schema_components() + tables = self.execute_sql(query, catalog_name, schema_name) or [] return [table[0] for table in tables] def drop_dataset(self) -> None: diff --git a/dlt/destinations/impl/duckdb/__init__.py b/dlt/destinations/impl/duckdb/__init__.py index 5cbc8dea53..e69de29bb2 100644 --- a/dlt/destinations/impl/duckdb/__init__.py +++ b/dlt/destinations/impl/duckdb/__init__.py @@ -1,26 +0,0 @@ -from dlt.common.data_writers.escape import escape_postgres_identifier, escape_duckdb_literal -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = "insert_values" - caps.supported_loader_file_formats = ["insert_values", "parquet", "jsonl"] - caps.preferred_staging_file_format = None - caps.supported_staging_file_formats = [] - caps.escape_identifier = escape_postgres_identifier - caps.escape_literal = escape_duckdb_literal - caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) - caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) - caps.max_identifier_length = 65536 - caps.max_column_identifier_length = 65536 - caps.max_query_length = 32 * 1024 * 1024 - caps.is_max_query_length_in_bytes = True - caps.max_text_data_type_length = 1024 * 1024 * 1024 - caps.is_max_text_data_type_length_in_bytes = True - caps.supports_ddl_transactions = True - caps.alter_add_multi_column = False - caps.supports_truncate_command = False - - return caps diff --git a/dlt/destinations/impl/duckdb/duck.py b/dlt/destinations/impl/duckdb/duck.py index 7016e9bfff..b87a2c4780 100644 --- a/dlt/destinations/impl/duckdb/duck.py +++ b/dlt/destinations/impl/duckdb/duck.py @@ -12,7 +12,6 @@ from dlt.destinations.insert_job_client import InsertValuesJobClient -from dlt.destinations.impl.duckdb import capabilities from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient from dlt.destinations.impl.duckdb.configuration import DuckDbClientConfiguration from dlt.destinations.type_mapping import TypeMapper @@ -151,10 +150,15 @@ def exception(self) -> str: class DuckDbClient(InsertValuesJobClient): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - - def __init__(self, schema: Schema, config: DuckDbClientConfiguration) -> None: - sql_client = DuckDbSqlClient(config.normalize_dataset_name(schema), config.credentials) + def __init__( + self, + schema: Schema, + config: DuckDbClientConfiguration, + capabilities: 
DestinationCapabilitiesContext, + ) -> None: + sql_client = DuckDbSqlClient( + config.normalize_dataset_name(schema), config.credentials, capabilities + ) super().__init__(schema, config, sql_client) self.config: DuckDbClientConfiguration = config self.sql_client: DuckDbSqlClient = sql_client # type: ignore @@ -173,7 +177,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non for h in self.active_hints.keys() if c.get(h, False) is True ) - column_name = self.capabilities.escape_identifier(c["name"]) + column_name = self.sql_client.escape_column_name(c["name"]) return ( f"{column_name} {self.type_mapper.to_db_type(c)} {hints_str} {self._gen_not_null(c.get('nullable', True))}" ) diff --git a/dlt/destinations/impl/duckdb/factory.py b/dlt/destinations/impl/duckdb/factory.py index 55fcd3b339..388f914479 100644 --- a/dlt/destinations/impl/duckdb/factory.py +++ b/dlt/destinations/impl/duckdb/factory.py @@ -1,8 +1,10 @@ import typing as t from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.data_writers.escape import escape_postgres_identifier, escape_duckdb_literal +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE + from dlt.destinations.impl.duckdb.configuration import DuckDbCredentials, DuckDbClientConfiguration -from dlt.destinations.impl.duckdb import capabilities if t.TYPE_CHECKING: from duckdb import DuckDBPyConnection @@ -12,8 +14,29 @@ class duckdb(Destination[DuckDbClientConfiguration, "DuckDbClient"]): spec = DuckDbClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities() + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "insert_values" + caps.supported_loader_file_formats = ["insert_values", "parquet", "jsonl"] + caps.preferred_staging_file_format = None + caps.supported_staging_file_formats = [] + caps.escape_identifier = escape_postgres_identifier + # all identifiers are case insensitive but are stored as is + caps.escape_literal = escape_duckdb_literal + caps.has_case_sensitive_identifiers = False + caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) + caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) + caps.max_identifier_length = 65536 + caps.max_column_identifier_length = 65536 + caps.max_query_length = 32 * 1024 * 1024 + caps.is_max_query_length_in_bytes = True + caps.max_text_data_type_length = 1024 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = True + caps.supports_ddl_transactions = True + caps.alter_add_multi_column = False + caps.supports_truncate_command = False + + return caps @property def client_class(self) -> t.Type["DuckDbClient"]: diff --git a/dlt/destinations/impl/duckdb/sql_client.py b/dlt/destinations/impl/duckdb/sql_client.py index bb85b5825b..95762a1f26 100644 --- a/dlt/destinations/impl/duckdb/sql_client.py +++ b/dlt/destinations/impl/duckdb/sql_client.py @@ -17,12 +17,11 @@ raise_open_connection_error, ) -from dlt.destinations.impl.duckdb import capabilities from dlt.destinations.impl.duckdb.configuration import DuckDbBaseCredentials class DuckDBDBApiCursorImpl(DBApiCursorImpl): - """Use native BigQuery data frame support if available""" + """Use native duckdb data frame support if available""" native_cursor: duckdb.DuckDBPyConnection # type: ignore vector_size: ClassVar[int] = 2048 @@ -43,10 +42,14 @@ def df(self, chunk_size: int = None, **kwargs: Any) -> DataFrame: class 
DuckDbSqlClient(SqlClientBase[duckdb.DuckDBPyConnection], DBTransaction): dbapi: ClassVar[DBApi] = duckdb - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - def __init__(self, dataset_name: str, credentials: DuckDbBaseCredentials) -> None: - super().__init__(None, dataset_name) + def __init__( + self, + dataset_name: str, + credentials: DuckDbBaseCredentials, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(None, dataset_name, capabilities) self._conn: duckdb.DuckDBPyConnection = None self.credentials = credentials @@ -142,11 +145,6 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB # else: # return None - def fully_qualified_dataset_name(self, escape: bool = True) -> str: - return ( - self.capabilities.escape_identifier(self.dataset_name) if escape else self.dataset_name - ) - @classmethod def _make_database_exception(cls, ex: Exception) -> Exception: if isinstance(ex, (duckdb.CatalogException)): diff --git a/dlt/destinations/impl/dummy/__init__.py b/dlt/destinations/impl/dummy/__init__.py index e09f7d07a9..e69de29bb2 100644 --- a/dlt/destinations/impl/dummy/__init__.py +++ b/dlt/destinations/impl/dummy/__init__.py @@ -1,39 +0,0 @@ -from typing import List -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.capabilities import TLoaderFileFormat - -from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration - - -@with_config( - spec=DummyClientConfiguration, - sections=( - known_sections.DESTINATION, - "dummy", - ), -) -def _configure(config: DummyClientConfiguration = config.value) -> DummyClientConfiguration: - return config - - -def capabilities() -> DestinationCapabilitiesContext: - config = _configure() - additional_formats: List[TLoaderFileFormat] = ( - ["reference"] if config.create_followup_jobs else [] # type:ignore[list-item] - ) - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = config.loader_file_format - caps.supported_loader_file_formats = additional_formats + [config.loader_file_format] - caps.preferred_staging_file_format = None - caps.supported_staging_file_formats = additional_formats + [config.loader_file_format] - caps.max_identifier_length = 127 - caps.max_column_identifier_length = 127 - caps.max_query_length = 8 * 1024 * 1024 - caps.is_max_query_length_in_bytes = True - caps.max_text_data_type_length = 65536 - caps.is_max_text_data_type_length_in_bytes = True - caps.supports_ddl_transactions = False - - return caps diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index 3c78493b57..c41b7dca61 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -36,7 +36,6 @@ LoadJobNotExistsException, LoadJobInvalidStateTransitionException, ) -from dlt.destinations.impl.dummy import capabilities from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration from dlt.destinations.job_impl import NewReferenceJob @@ -110,10 +109,13 @@ def create_followup_jobs(self, final_state: TLoadJobState) -> List[NewLoadJob]: class DummyClient(JobClientBase, SupportsStagingDestination, WithStagingDataset): """dummy client storing jobs in memory""" - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - - def __init__(self, schema: Schema, config: DummyClientConfiguration) -> None: - 
super().__init__(schema, config) + def __init__( + self, + schema: Schema, + config: DummyClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(schema, config, capabilities) self.in_staging_context = False self.config: DummyClientConfiguration = config @@ -160,7 +162,7 @@ def restore_file_load(self, file_path: str) -> LoadJob: def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], - table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, + completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, ) -> List[NewLoadJob]: """Creates a list of followup jobs that should be executed after a table chain is completed""" return [] diff --git a/dlt/destinations/impl/dummy/factory.py b/dlt/destinations/impl/dummy/factory.py index 1c848cf22d..c68bc36ca9 100644 --- a/dlt/destinations/impl/dummy/factory.py +++ b/dlt/destinations/impl/dummy/factory.py @@ -2,11 +2,12 @@ from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.destination.capabilities import TLoaderFileFormat +from dlt.common.normalizers.naming.naming import NamingConvention from dlt.destinations.impl.dummy.configuration import ( DummyClientConfiguration, DummyClientCredentials, ) -from dlt.destinations.impl.dummy import capabilities if t.TYPE_CHECKING: from dlt.destinations.impl.dummy.dummy import DummyClient @@ -15,8 +16,19 @@ class dummy(Destination[DummyClientConfiguration, "DummyClient"]): spec = DummyClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities() + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_staging_file_format = None + caps.has_case_sensitive_identifiers = True + caps.max_identifier_length = 127 + caps.max_column_identifier_length = 127 + caps.max_query_length = 8 * 1024 * 1024 + caps.is_max_query_length_in_bytes = True + caps.max_text_data_type_length = 65536 + caps.is_max_text_data_type_length_in_bytes = True + caps.supports_ddl_transactions = False + + return caps @property def client_class(self) -> t.Type["DummyClient"]: @@ -37,3 +49,19 @@ def __init__( environment=environment, **kwargs, ) + + @classmethod + def adjust_capabilities( + cls, + caps: DestinationCapabilitiesContext, + config: DummyClientConfiguration, + naming: t.Optional[NamingConvention], + ) -> DestinationCapabilitiesContext: + caps = super().adjust_capabilities(caps, config, naming) + additional_formats: t.List[TLoaderFileFormat] = ( + ["reference"] if config.create_followup_jobs else [] # type:ignore[list-item] + ) + caps.preferred_loader_file_format = config.loader_file_format + caps.supported_loader_file_formats = additional_formats + [config.loader_file_format] + caps.supported_staging_file_formats = additional_formats + [config.loader_file_format] + return caps diff --git a/dlt/destinations/impl/filesystem/__init__.py b/dlt/destinations/impl/filesystem/__init__.py index 49fabd61d7..e69de29bb2 100644 --- a/dlt/destinations/impl/filesystem/__init__.py +++ b/dlt/destinations/impl/filesystem/__init__.py @@ -1,24 +0,0 @@ -from typing import Sequence, Tuple - -from dlt.common.schema.typing import TTableSchema -from dlt.common.destination import DestinationCapabilitiesContext, TLoaderFileFormat - - -def loader_file_format_adapter( - preferred_loader_file_format: TLoaderFileFormat, - supported_loader_file_formats: Sequence[TLoaderFileFormat], - /, - *, - table_schema: TTableSchema, -) -> 
Tuple[TLoaderFileFormat, Sequence[TLoaderFileFormat]]: - if table_schema.get("table_format") == "delta": - return ("parquet", ["parquet"]) - return (preferred_loader_file_format, supported_loader_file_formats) - - -def capabilities() -> DestinationCapabilitiesContext: - return DestinationCapabilitiesContext.generic_capabilities( - preferred_loader_file_format="jsonl", - loader_file_format_adapter=loader_file_format_adapter, - supported_table_formats=["delta"], - ) diff --git a/dlt/destinations/impl/filesystem/factory.py b/dlt/destinations/impl/filesystem/factory.py index 029a5bdda5..1e6eec5cce 100644 --- a/dlt/destinations/impl/filesystem/factory.py +++ b/dlt/destinations/impl/filesystem/factory.py @@ -1,19 +1,38 @@ import typing as t -from dlt.destinations.impl.filesystem.configuration import FilesystemDestinationClientConfiguration -from dlt.destinations.impl.filesystem import capabilities -from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.destination import Destination, DestinationCapabilitiesContext, TLoaderFileFormat +from dlt.common.destination.reference import DEFAULT_FILE_LAYOUT +from dlt.common.schema.typing import TTableSchema from dlt.common.storages.configuration import FileSystemCredentials +from dlt.destinations.impl.filesystem.configuration import FilesystemDestinationClientConfiguration +from dlt.destinations.impl.filesystem.typing import TCurrentDateTime, TExtraPlaceholders + if t.TYPE_CHECKING: from dlt.destinations.impl.filesystem.filesystem import FilesystemClient +def loader_file_format_adapter( + preferred_loader_file_format: TLoaderFileFormat, + supported_loader_file_formats: t.Sequence[TLoaderFileFormat], + /, + *, + table_schema: TTableSchema, +) -> t.Tuple[TLoaderFileFormat, t.Sequence[TLoaderFileFormat]]: + if table_schema.get("table_format") == "delta": + return ("parquet", ["parquet"]) + return (preferred_loader_file_format, supported_loader_file_formats) + + class filesystem(Destination[FilesystemDestinationClientConfiguration, "FilesystemClient"]): spec = FilesystemDestinationClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities() + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + return DestinationCapabilitiesContext.generic_capabilities( + preferred_loader_file_format="jsonl", + loader_file_format_adapter=loader_file_format_adapter, + supported_table_formats=["delta"], + ) @property def client_class(self) -> t.Type["FilesystemClient"]: @@ -25,6 +44,9 @@ def __init__( self, bucket_url: str = None, credentials: t.Union[FileSystemCredentials, t.Dict[str, t.Any], t.Any] = None, + layout: str = DEFAULT_FILE_LAYOUT, + extra_placeholders: t.Optional[TExtraPlaceholders] = None, + current_datetime: t.Optional[TCurrentDateTime] = None, destination_name: t.Optional[str] = None, environment: t.Optional[str] = None, **kwargs: t.Any, @@ -46,11 +68,20 @@ def __init__( credentials: Credentials to connect to the filesystem. The type of credentials should correspond to the bucket protocol. For example, for AWS S3, the credentials should be an instance of `AwsCredentials`. A dictionary with the credentials parameters can also be provided. + layout (str): A layout of the files holding table data in the destination bucket/filesystem. Uses a set of pre-defined + and user-defined (extra) placeholders. 
Please refer to https://dlthub.com/docs/dlt-ecosystem/destinations/filesystem#files-layout + extra_placeholders (dict(str, str | callable)): A dictionary of extra placeholder names that can be used in the `layout` parameter. Names + are mapped to string values or to callables evaluated at runtime. + current_datetime (DateTime | callable): current datetime used by date/time related placeholders. If not provided, load package creation timestamp + will be used. **kwargs: Additional arguments passed to the destination config """ super().__init__( bucket_url=bucket_url, credentials=credentials, + layout=layout, + extra_placeholders=extra_placeholders, + current_datetime=current_datetime, destination_name=destination_name, environment=environment, **kwargs, diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 9d15ba959e..00b990d4fa 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -12,7 +12,7 @@ from dlt.common.typing import DictStrAny from dlt.common.schema import Schema, TSchemaTables, TTableSchema from dlt.common.storages import FileStorage, fsspec_from_config -from dlt.common.storages.load_package import LoadJobInfo, ParsedLoadJobFileName +from dlt.common.storages.load_package import LoadJobInfo, ParsedLoadJobFileName, TPipelineStateDoc from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( NewLoadJob, @@ -29,7 +29,6 @@ ) from dlt.common.destination.exceptions import DestinationUndefinedEntity from dlt.destinations.job_impl import EmptyLoadJob, NewReferenceJob -from dlt.destinations.impl.filesystem import capabilities from dlt.destinations.impl.filesystem.configuration import FilesystemDestinationClientConfiguration from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations import path_utils @@ -153,15 +152,19 @@ def create_followup_jobs(self, final_state: TLoadJobState) -> List[NewLoadJob]: class FilesystemClient(FSClientBase, JobClientBase, WithStagingDataset, WithStateSync): """filesystem client storing jobs in memory""" - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() fs_client: AbstractFileSystem # a path (without the scheme) to a location in the bucket where dataset is present bucket_path: str # name of the dataset dataset_name: str - def __init__(self, schema: Schema, config: FilesystemDestinationClientConfiguration) -> None: - super().__init__(schema, config) + def __init__( + self, + schema: Schema, + config: FilesystemDestinationClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(schema, config, capabilities) self.fs_client, fs_path = fsspec_from_config(config) self.is_local_filesystem = config.protocol == "file" self.bucket_path = ( @@ -365,7 +368,7 @@ def _write_to_json_file(self, filepath: str, data: DictStrAny) -> None: dirname = self.pathlib.dirname(filepath) if not self.fs_client.isdir(dirname): return - self.fs_client.write_text(filepath, json.dumps(data), "utf-8") + self.fs_client.write_text(filepath, json.dumps(data), encoding="utf-8") def _to_path_safe_string(self, s: str) -> str: """for base64 strings""" @@ -447,8 +450,13 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: # Load compressed state from destination if selected_path: - state_json = json.loads(self.fs_client.read_text(selected_path)) - state_json.pop("version_hash") + state_json: TPipelineStateDoc = json.loads( + 
self.fs_client.read_text(selected_path, encoding="utf-8") + ) + # we had dlt_load_id stored until version 0.5 and since we do not have any version control + # we always migrate + if load_id := state_json.pop("dlt_load_id", None): # type: ignore[typeddict-item] + state_json["_dlt_load_id"] = load_id return StateInfo(**state_json) return None @@ -491,7 +499,9 @@ def _get_stored_schema_by_hash_or_newest( break if selected_path: - return StorageSchemaInfo(**json.loads(self.fs_client.read_text(selected_path))) + return StorageSchemaInfo( + **json.loads(self.fs_client.read_text(selected_path, encoding="utf-8")) + ) return None @@ -528,19 +538,23 @@ def get_stored_schema_by_hash(self, version_hash: str) -> Optional[StorageSchema def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], - table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, + completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, ) -> List[NewLoadJob]: def get_table_jobs( table_jobs: Sequence[LoadJobInfo], table_name: str ) -> Sequence[LoadJobInfo]: return [job for job in table_jobs if job.job_file_info.table_name == table_name] - assert table_chain_jobs is not None - jobs = super().create_table_chain_completed_followup_jobs(table_chain, table_chain_jobs) + assert completed_table_chain_jobs is not None + jobs = super().create_table_chain_completed_followup_jobs( + table_chain, completed_table_chain_jobs + ) table_format = table_chain[0].get("table_format") if table_format == "delta": delta_jobs = [ - DeltaLoadFilesystemJob(self, table, get_table_jobs(table_chain_jobs, table["name"])) + DeltaLoadFilesystemJob( + self, table, get_table_jobs(completed_table_chain_jobs, table["name"]) + ) for table in table_chain ] jobs.extend(delta_jobs) diff --git a/dlt/destinations/impl/filesystem/typing.py b/dlt/destinations/impl/filesystem/typing.py index 139602198d..6781fe21ac 100644 --- a/dlt/destinations/impl/filesystem/typing.py +++ b/dlt/destinations/impl/filesystem/typing.py @@ -15,5 +15,7 @@ `schema name`, `table name`, `load_id`, `file_id` and an `extension` """ -TExtraPlaceholders: TypeAlias = Dict[str, Union[str, TLayoutPlaceholderCallback]] +TExtraPlaceholders: TypeAlias = Dict[ + str, Union[Union[str, int, DateTime], TLayoutPlaceholderCallback] +] """Extra placeholders for filesystem layout""" diff --git a/dlt/destinations/impl/motherduck/__init__.py b/dlt/destinations/impl/motherduck/__init__.py index 74c0e36ef3..e69de29bb2 100644 --- a/dlt/destinations/impl/motherduck/__init__.py +++ b/dlt/destinations/impl/motherduck/__init__.py @@ -1,24 +0,0 @@ -from dlt.common.data_writers.escape import escape_postgres_identifier, escape_duckdb_literal -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = "parquet" - caps.supported_loader_file_formats = ["parquet", "insert_values", "jsonl"] - caps.escape_identifier = escape_postgres_identifier - caps.escape_literal = escape_duckdb_literal - caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) - caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) - caps.max_identifier_length = 65536 - caps.max_column_identifier_length = 65536 - caps.max_query_length = 512 * 1024 - caps.is_max_query_length_in_bytes = True - caps.max_text_data_type_length = 1024 * 1024 * 1024 - 
caps.is_max_text_data_type_length_in_bytes = True - caps.supports_ddl_transactions = False - caps.alter_add_multi_column = False - caps.supports_truncate_command = False - - return caps diff --git a/dlt/destinations/impl/motherduck/factory.py b/dlt/destinations/impl/motherduck/factory.py index 5e35f69d75..df7418b9db 100644 --- a/dlt/destinations/impl/motherduck/factory.py +++ b/dlt/destinations/impl/motherduck/factory.py @@ -1,11 +1,13 @@ import typing as t from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.data_writers.escape import escape_postgres_identifier, escape_duckdb_literal +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE + from dlt.destinations.impl.motherduck.configuration import ( MotherDuckCredentials, MotherDuckClientConfiguration, ) -from dlt.destinations.impl.motherduck import capabilities if t.TYPE_CHECKING: from duckdb import DuckDBPyConnection @@ -15,8 +17,27 @@ class motherduck(Destination[MotherDuckClientConfiguration, "MotherDuckClient"]): spec = MotherDuckClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities() + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "parquet" + caps.supported_loader_file_formats = ["parquet", "insert_values", "jsonl"] + caps.escape_identifier = escape_postgres_identifier + # all identifiers are case insensitive but are stored as is + caps.escape_literal = escape_duckdb_literal + caps.has_case_sensitive_identifiers = False + caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) + caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) + caps.max_identifier_length = 65536 + caps.max_column_identifier_length = 65536 + caps.max_query_length = 512 * 1024 + caps.is_max_query_length_in_bytes = True + caps.max_text_data_type_length = 1024 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = True + caps.supports_ddl_transactions = False + caps.alter_add_multi_column = False + caps.supports_truncate_command = False + + return caps @property def client_class(self) -> t.Type["MotherDuckClient"]: diff --git a/dlt/destinations/impl/motherduck/motherduck.py b/dlt/destinations/impl/motherduck/motherduck.py index c695d9715e..3a5f172864 100644 --- a/dlt/destinations/impl/motherduck/motherduck.py +++ b/dlt/destinations/impl/motherduck/motherduck.py @@ -1,20 +1,22 @@ -from typing import ClassVar - from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.schema import Schema from dlt.destinations.impl.duckdb.duck import DuckDbClient -from dlt.destinations.impl.motherduck import capabilities from dlt.destinations.impl.motherduck.sql_client import MotherDuckSqlClient from dlt.destinations.impl.motherduck.configuration import MotherDuckClientConfiguration class MotherDuckClient(DuckDbClient): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - - def __init__(self, schema: Schema, config: MotherDuckClientConfiguration) -> None: - super().__init__(schema, config) # type: ignore - sql_client = MotherDuckSqlClient(config.normalize_dataset_name(schema), config.credentials) + def __init__( + self, + schema: Schema, + config: MotherDuckClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(schema, config, capabilities) # type: ignore + sql_client = MotherDuckSqlClient( + config.normalize_dataset_name(schema), config.credentials, 
capabilities + ) self.config: MotherDuckClientConfiguration = config # type: ignore self.sql_client: MotherDuckSqlClient = sql_client diff --git a/dlt/destinations/impl/motherduck/sql_client.py b/dlt/destinations/impl/motherduck/sql_client.py index 7990f90947..40157406ab 100644 --- a/dlt/destinations/impl/motherduck/sql_client.py +++ b/dlt/destinations/impl/motherduck/sql_client.py @@ -1,41 +1,22 @@ -import duckdb +from typing import Optional -from contextlib import contextmanager -from typing import Any, AnyStr, ClassVar, Iterator, Optional, Sequence -from dlt.common.destination import DestinationCapabilitiesContext - -from dlt.destinations.exceptions import ( - DatabaseTerminalException, - DatabaseTransientException, - DatabaseUndefinedRelation, -) -from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction, DataFrame -from dlt.destinations.sql_client import ( - SqlClientBase, - DBApiCursorImpl, - raise_database_error, - raise_open_connection_error, -) - -from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient, DuckDBDBApiCursorImpl -from dlt.destinations.impl.motherduck import capabilities +from dlt.common.destination.capabilities import DestinationCapabilitiesContext +from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials class MotherDuckSqlClient(DuckDbSqlClient): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - - def __init__(self, dataset_name: str, credentials: MotherDuckCredentials) -> None: - super().__init__(dataset_name, credentials) + def __init__( + self, + dataset_name: str, + credentials: MotherDuckCredentials, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(dataset_name, credentials, capabilities) self.database_name = credentials.database - def fully_qualified_dataset_name(self, escape: bool = True) -> str: - database_name = ( - self.capabilities.escape_identifier(self.database_name) - if escape - else self.database_name - ) - dataset_name = ( - self.capabilities.escape_identifier(self.dataset_name) if escape else self.dataset_name - ) - return f"{database_name}.{dataset_name}" + def catalog_name(self, escape: bool = True) -> Optional[str]: + database_name = self.database_name + if escape: + database_name = self.capabilities.escape_identifier(database_name) + return database_name diff --git a/dlt/destinations/impl/mssql/__init__.py b/dlt/destinations/impl/mssql/__init__.py index f7768d9238..e69de29bb2 100644 --- a/dlt/destinations/impl/mssql/__init__.py +++ b/dlt/destinations/impl/mssql/__init__.py @@ -1,29 +0,0 @@ -from dlt.common.data_writers.escape import escape_postgres_identifier, escape_mssql_literal -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.common.wei import EVM_DECIMAL_PRECISION - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = "insert_values" - caps.supported_loader_file_formats = ["insert_values"] - caps.preferred_staging_file_format = None - caps.supported_staging_file_formats = [] - caps.escape_identifier = escape_postgres_identifier - caps.escape_literal = escape_mssql_literal - caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) - caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) - # 
https://learn.microsoft.com/en-us/sql/sql-server/maximum-capacity-specifications-for-sql-server?view=sql-server-ver16&redirectedfrom=MSDN - caps.max_identifier_length = 128 - caps.max_column_identifier_length = 128 - # A SQL Query can be a varchar(max) but is shown as limited to 65,536 * Network Packet - caps.max_query_length = 65536 * 10 - caps.is_max_query_length_in_bytes = True - caps.max_text_data_type_length = 2**30 - 1 - caps.is_max_text_data_type_length_in_bytes = False - caps.supports_ddl_transactions = True - caps.max_rows_per_insert = 1000 - caps.timestamp_precision = 7 - - return caps diff --git a/dlt/destinations/impl/mssql/configuration.py b/dlt/destinations/impl/mssql/configuration.py index 8a50ecc6d2..64d87065f3 100644 --- a/dlt/destinations/impl/mssql/configuration.py +++ b/dlt/destinations/impl/mssql/configuration.py @@ -93,6 +93,7 @@ class MsSqlClientConfiguration(DestinationClientDwhWithStagingConfiguration): credentials: MsSqlCredentials = None create_indexes: bool = False + has_case_sensitive_identifiers: bool = False def fingerprint(self) -> str: """Returns a fingerprint of host part of a connection string""" diff --git a/dlt/destinations/impl/mssql/factory.py b/dlt/destinations/impl/mssql/factory.py index 2e19d7c2a8..6912510995 100644 --- a/dlt/destinations/impl/mssql/factory.py +++ b/dlt/destinations/impl/mssql/factory.py @@ -1,30 +1,58 @@ import typing as t from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.normalizers.naming.naming import NamingConvention +from dlt.common.data_writers.escape import escape_postgres_identifier, escape_mssql_literal +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE from dlt.destinations.impl.mssql.configuration import MsSqlCredentials, MsSqlClientConfiguration -from dlt.destinations.impl.mssql import capabilities if t.TYPE_CHECKING: - from dlt.destinations.impl.mssql.mssql import MsSqlClient + from dlt.destinations.impl.mssql.mssql import MsSqlJobClient -class mssql(Destination[MsSqlClientConfiguration, "MsSqlClient"]): +class mssql(Destination[MsSqlClientConfiguration, "MsSqlJobClient"]): spec = MsSqlClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities() + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "insert_values" + caps.supported_loader_file_formats = ["insert_values"] + caps.preferred_staging_file_format = None + caps.supported_staging_file_formats = [] + # mssql is by default case insensitive and stores identifiers as is + # case sensitivity can be changed by database collation so we allow to reconfigure + # capabilities in the mssql factory + caps.escape_identifier = escape_postgres_identifier + caps.escape_literal = escape_mssql_literal + caps.has_case_sensitive_identifiers = False + caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) + caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) + # https://learn.microsoft.com/en-us/sql/sql-server/maximum-capacity-specifications-for-sql-server?view=sql-server-ver16&redirectedfrom=MSDN + caps.max_identifier_length = 128 + caps.max_column_identifier_length = 128 + # A SQL Query can be a varchar(max) but is shown as limited to 65,536 * Network Packet + caps.max_query_length = 65536 * 10 + caps.is_max_query_length_in_bytes = True + caps.max_text_data_type_length = 2**30 - 1 + caps.is_max_text_data_type_length_in_bytes = False + 
caps.supports_ddl_transactions = True + caps.max_rows_per_insert = 1000 + caps.timestamp_precision = 7 + + return caps @property - def client_class(self) -> t.Type["MsSqlClient"]: - from dlt.destinations.impl.mssql.mssql import MsSqlClient + def client_class(self) -> t.Type["MsSqlJobClient"]: + from dlt.destinations.impl.mssql.mssql import MsSqlJobClient - return MsSqlClient + return MsSqlJobClient def __init__( self, credentials: t.Union[MsSqlCredentials, t.Dict[str, t.Any], str] = None, - create_indexes: bool = True, + create_indexes: bool = False, + has_case_sensitive_identifiers: bool = False, destination_name: t.Optional[str] = None, environment: t.Optional[str] = None, **kwargs: t.Any, @@ -37,12 +65,27 @@ def __init__( credentials: Credentials to connect to the mssql database. Can be an instance of `MsSqlCredentials` or a connection string in the format `mssql://user:password@host:port/database` create_indexes: Should unique indexes be created + has_case_sensitive_identifiers: Are identifiers used by mssql database case sensitive (following the collation) **kwargs: Additional arguments passed to the destination config """ super().__init__( credentials=credentials, create_indexes=create_indexes, + has_case_sensitive_identifiers=has_case_sensitive_identifiers, destination_name=destination_name, environment=environment, **kwargs, ) + + @classmethod + def adjust_capabilities( + cls, + caps: DestinationCapabilitiesContext, + config: MsSqlClientConfiguration, + naming: t.Optional[NamingConvention], + ) -> DestinationCapabilitiesContext: + # modify the caps if case sensitive identifiers are requested + if config.has_case_sensitive_identifiers: + caps.has_case_sensitive_identifiers = True + caps.casefold_identifier = str + return super().adjust_capabilities(caps, config, naming) diff --git a/dlt/destinations/impl/mssql/mssql.py b/dlt/destinations/impl/mssql/mssql.py index 6f364c8af1..25aab5c52a 100644 --- a/dlt/destinations/impl/mssql/mssql.py +++ b/dlt/destinations/impl/mssql/mssql.py @@ -1,19 +1,15 @@ -from typing import ClassVar, Dict, Optional, Sequence, List, Any, Tuple +from typing import Dict, Optional, Sequence, List, Any from dlt.common.exceptions import TerminalValueError -from dlt.common.wei import EVM_DECIMAL_PRECISION from dlt.common.destination.reference import NewLoadJob from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.data_types import TDataType from dlt.common.schema import TColumnSchema, TColumnHint, Schema from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat -from dlt.common.utils import uniq_id from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlMergeJob, SqlJobParams from dlt.destinations.insert_job_client import InsertValuesJobClient -from dlt.destinations.impl.mssql import capabilities from dlt.destinations.impl.mssql.sql_client import PyOdbcMsSqlClient from dlt.destinations.impl.mssql.configuration import MsSqlClientConfiguration from dlt.destinations.sql_client import SqlClientBase @@ -145,11 +141,16 @@ def _new_temp_table_name(cls, name_prefix: str, sql_client: SqlClientBase[Any]) return "#" + name -class MsSqlClient(InsertValuesJobClient): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - - def __init__(self, schema: Schema, config: MsSqlClientConfiguration) -> None: - sql_client = PyOdbcMsSqlClient(config.normalize_dataset_name(schema), config.credentials) +class MsSqlJobClient(InsertValuesJobClient): + def __init__( + self, + schema: Schema, + config: 
MsSqlClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + sql_client = PyOdbcMsSqlClient( + config.normalize_dataset_name(schema), config.credentials, capabilities + ) super().__init__(schema, config, sql_client) self.config: MsSqlClientConfiguration = config self.sql_client = sql_client @@ -180,7 +181,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non for h in self.active_hints.keys() if c.get(h, False) is True ) - column_name = self.capabilities.escape_identifier(c["name"]) + column_name = self.sql_client.escape_column_name(c["name"]) return f"{column_name} {db_type} {hints_str} {self._gen_not_null(c.get('nullable', True))}" def _create_replace_followup_jobs( diff --git a/dlt/destinations/impl/mssql/sql_client.py b/dlt/destinations/impl/mssql/sql_client.py index db043bae25..a360670e77 100644 --- a/dlt/destinations/impl/mssql/sql_client.py +++ b/dlt/destinations/impl/mssql/sql_client.py @@ -1,4 +1,3 @@ -import platform import struct from datetime import datetime, timedelta, timezone # noqa: I251 @@ -23,7 +22,6 @@ ) from dlt.destinations.impl.mssql.configuration import MsSqlCredentials -from dlt.destinations.impl.mssql import capabilities def handle_datetimeoffset(dto_value: bytes) -> datetime: @@ -43,10 +41,14 @@ def handle_datetimeoffset(dto_value: bytes) -> datetime: class PyOdbcMsSqlClient(SqlClientBase[pyodbc.Connection], DBTransaction): dbapi: ClassVar[DBApi] = pyodbc - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - def __init__(self, dataset_name: str, credentials: MsSqlCredentials) -> None: - super().__init__(credentials.database, dataset_name) + def __init__( + self, + dataset_name: str, + credentials: MsSqlCredentials, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(credentials.database, dataset_name, capabilities) self._conn: pyodbc.Connection = None self.credentials = credentials @@ -104,14 +106,14 @@ def drop_dataset(self) -> None: # Drop all views rows = self.execute_sql( "SELECT table_name FROM information_schema.views WHERE table_schema = %s;", - self.dataset_name, + self.capabilities.casefold_identifier(self.dataset_name), ) view_names = [row[0] for row in rows] self._drop_views(*view_names) # Drop all tables rows = self.execute_sql( "SELECT table_name FROM information_schema.tables WHERE table_schema = %s;", - self.dataset_name, + self.capabilities.casefold_identifier(self.dataset_name), ) table_names = [row[0] for row in rows] self.drop_tables(*table_names) @@ -158,11 +160,6 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB except pyodbc.Error as outer: raise outer - def fully_qualified_dataset_name(self, escape: bool = True) -> str: - return ( - self.capabilities.escape_identifier(self.dataset_name) if escape else self.dataset_name - ) - @classmethod def _make_database_exception(cls, ex: Exception) -> Exception: if isinstance(ex, pyodbc.ProgrammingError): diff --git a/dlt/destinations/impl/postgres/__init__.py b/dlt/destinations/impl/postgres/__init__.py index bdb9297210..e69de29bb2 100644 --- a/dlt/destinations/impl/postgres/__init__.py +++ b/dlt/destinations/impl/postgres/__init__.py @@ -1,27 +0,0 @@ -from dlt.common.data_writers.escape import escape_postgres_identifier, escape_postgres_literal -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration -from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, 
DEFAULT_NUMERIC_SCALE -from dlt.common.wei import EVM_DECIMAL_PRECISION - - -def capabilities() -> DestinationCapabilitiesContext: - # https://www.postgresql.org/docs/current/limits.html - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = "insert_values" - caps.supported_loader_file_formats = ["insert_values", "csv"] - caps.preferred_staging_file_format = None - caps.supported_staging_file_formats = [] - caps.escape_identifier = escape_postgres_identifier - caps.escape_literal = escape_postgres_literal - caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) - caps.wei_precision = (2 * EVM_DECIMAL_PRECISION, EVM_DECIMAL_PRECISION) - caps.max_identifier_length = 63 - caps.max_column_identifier_length = 63 - caps.max_query_length = 32 * 1024 * 1024 - caps.is_max_query_length_in_bytes = True - caps.max_text_data_type_length = 1024 * 1024 * 1024 - caps.is_max_text_data_type_length_in_bytes = True - caps.supports_ddl_transactions = True - - return caps diff --git a/dlt/destinations/impl/postgres/configuration.py b/dlt/destinations/impl/postgres/configuration.py index ae0b5200b2..13bdc7f6b2 100644 --- a/dlt/destinations/impl/postgres/configuration.py +++ b/dlt/destinations/impl/postgres/configuration.py @@ -1,6 +1,7 @@ import dataclasses -from typing import Dict, Final, ClassVar, Any, List, TYPE_CHECKING, Union +from typing import Dict, Final, ClassVar, Any, List, Optional +from dlt.common.data_writers.configuration import CsvFormatConfiguration from dlt.common.libs.sql_alchemy import URL from dlt.common.configuration import configspec from dlt.common.configuration.specs import ConnectionStringCredentials @@ -37,6 +38,9 @@ class PostgresClientConfiguration(DestinationClientDwhWithStagingConfiguration): create_indexes: bool = True + csv_format: Optional[CsvFormatConfiguration] = None + """Optional csv format configuration""" + def fingerprint(self) -> str: """Returns a fingerprint of host part of a connection string""" if self.credentials and self.credentials.host: diff --git a/dlt/destinations/impl/postgres/factory.py b/dlt/destinations/impl/postgres/factory.py index 68d72f890a..b873bf97d5 100644 --- a/dlt/destinations/impl/postgres/factory.py +++ b/dlt/destinations/impl/postgres/factory.py @@ -1,12 +1,15 @@ import typing as t +from dlt.common.data_writers.configuration import CsvFormatConfiguration from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.data_writers.escape import escape_postgres_identifier, escape_postgres_literal +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE +from dlt.common.wei import EVM_DECIMAL_PRECISION from dlt.destinations.impl.postgres.configuration import ( PostgresCredentials, PostgresClientConfiguration, ) -from dlt.destinations.impl.postgres import capabilities if t.TYPE_CHECKING: from dlt.destinations.impl.postgres.postgres import PostgresClient @@ -15,8 +18,32 @@ class postgres(Destination[PostgresClientConfiguration, "PostgresClient"]): spec = PostgresClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities() + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + # https://www.postgresql.org/docs/current/limits.html + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "insert_values" + caps.supported_loader_file_formats = ["insert_values", "csv"] + caps.preferred_staging_file_format = None + caps.supported_staging_file_formats = [] + 
caps.escape_identifier = escape_postgres_identifier + # postgres has case sensitive identifiers but by default + # it folds them to lower case which makes them case insensitive + # https://stackoverflow.com/questions/20878932/are-postgresql-column-names-case-sensitive + caps.casefold_identifier = str.lower + caps.has_case_sensitive_identifiers = True + caps.escape_literal = escape_postgres_literal + caps.has_case_sensitive_identifiers = True + caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) + caps.wei_precision = (2 * EVM_DECIMAL_PRECISION, EVM_DECIMAL_PRECISION) + caps.max_identifier_length = 63 + caps.max_column_identifier_length = 63 + caps.max_query_length = 32 * 1024 * 1024 + caps.is_max_query_length_in_bytes = True + caps.max_text_data_type_length = 1024 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = True + caps.supports_ddl_transactions = True + + return caps @property def client_class(self) -> t.Type["PostgresClient"]: @@ -28,6 +55,7 @@ def __init__( self, credentials: t.Union[PostgresCredentials, t.Dict[str, t.Any], str] = None, create_indexes: bool = True, + csv_format: t.Optional[CsvFormatConfiguration] = None, destination_name: t.Optional[str] = None, environment: t.Optional[str] = None, **kwargs: t.Any, @@ -40,11 +68,13 @@ def __init__( credentials: Credentials to connect to the postgres database. Can be an instance of `PostgresCredentials` or a connection string in the format `postgres://user:password@host:port/database` create_indexes: Should unique indexes be created + csv_format: Formatting options for csv file format **kwargs: Additional arguments passed to the destination config """ super().__init__( credentials=credentials, create_indexes=create_indexes, + csv_format=csv_format, destination_name=destination_name, environment=environment, **kwargs, diff --git a/dlt/destinations/impl/postgres/postgres.py b/dlt/destinations/impl/postgres/postgres.py index 11cee208b1..7b173a7711 100644 --- a/dlt/destinations/impl/postgres/postgres.py +++ b/dlt/destinations/impl/postgres/postgres.py @@ -1,5 +1,11 @@ -from typing import ClassVar, Dict, Optional, Sequence, List, Any - +from typing import Dict, Optional, Sequence, List, Any + +from dlt.common import logger +from dlt.common.data_writers.configuration import CsvFormatConfiguration +from dlt.common.destination.exceptions import ( + DestinationInvalidFileFormat, + DestinationTerminalException, +) from dlt.common.destination.reference import FollowupJob, LoadJob, NewLoadJob, TLoadJobState from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.exceptions import TerminalValueError @@ -9,7 +15,6 @@ from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlJobParams from dlt.destinations.insert_job_client import InsertValuesJobClient -from dlt.destinations.impl.postgres import capabilities from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient from dlt.destinations.impl.postgres.configuration import PostgresClientConfiguration from dlt.destinations.sql_client import SqlClientBase @@ -106,21 +111,85 @@ def generate_sql( class PostgresCsvCopyJob(LoadJob, FollowupJob): - def __init__(self, table_name: str, file_path: str, sql_client: Psycopg2SqlClient) -> None: + def __init__(self, table: TTableSchema, file_path: str, client: "PostgresClient") -> None: super().__init__(FileStorage.get_file_name_from_file_path(file_path)) + config = client.config + sql_client = client.sql_client + csv_format = config.csv_format or CsvFormatConfiguration() + table_name = 
table["name"] + sep = csv_format.delimiter + if csv_format.on_error_continue: + logger.warning( + f"When processing {file_path} on table {table_name} Postgres csv reader does not" + " support on_error_continue" + ) with FileStorage.open_zipsafe_ro(file_path, "rb") as f: - # all headers in first line - headers = f.readline().decode("utf-8").strip() - # quote headers if not quoted - all special keywords like "binary" must be quoted - headers = ",".join(h if h.startswith('"') else f'"{h}"' for h in headers.split(",")) + if csv_format.include_header: + # all headers in first line + headers_row: str = f.readline().decode(csv_format.encoding).strip() + split_headers = headers_row.split(sep) + else: + # read first row to figure out the headers + split_first_row: str = f.readline().decode(csv_format.encoding).strip().split(sep) + split_headers = list(client.schema.get_table_columns(table_name).keys()) + if len(split_first_row) > len(split_headers): + raise DestinationInvalidFileFormat( + "postgres", + "csv", + file_path, + f"First row {split_first_row} has more rows than columns {split_headers} in" + f" table {table_name}", + ) + if len(split_first_row) < len(split_headers): + logger.warning( + f"First row {split_first_row} has less rows than columns {split_headers} in" + f" table {table_name}. We will not load data to superfluous columns." + ) + split_headers = split_headers[: len(split_first_row)] + # stream the first row again + f.seek(0) + + # normalized and quoted headers + split_headers = [ + sql_client.escape_column_name(h.strip('"'), escape=True) for h in split_headers + ] + split_null_headers = [] + split_columns = [] + # detect columns with NULL to use in FORCE NULL + # detect headers that are not in columns + for col in client.schema.get_table_columns(table_name).values(): + norm_col = sql_client.escape_column_name(col["name"], escape=True) + split_columns.append(norm_col) + if norm_col in split_headers and col.get("nullable", True): + split_null_headers.append(norm_col) + split_unknown_headers = set(split_headers).difference(split_columns) + if split_unknown_headers: + raise DestinationInvalidFileFormat( + "postgres", + "csv", + file_path, + f"Following headers {split_unknown_headers} cannot be matched to columns" + f" {split_columns} of table {table_name}.", + ) + + # use comma to join + headers = ",".join(split_headers) + if split_null_headers: + null_headers = f"FORCE_NULL({','.join(split_null_headers)})," + else: + null_headers = "" + qualified_table_name = sql_client.make_qualified_table_name(table_name) copy_sql = ( - "COPY %s (%s) FROM STDIN WITH (FORMAT CSV, DELIMITER ',', NULL '', FORCE_NULL(%s))" + "COPY %s (%s) FROM STDIN WITH (FORMAT CSV, DELIMITER '%s', NULL ''," + " %s ENCODING '%s')" % ( qualified_table_name, headers, - headers, + sep, + null_headers, + csv_format.encoding, ) ) with sql_client.begin_transaction(): @@ -135,10 +204,15 @@ def exception(self) -> str: class PostgresClient(InsertValuesJobClient): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - - def __init__(self, schema: Schema, config: PostgresClientConfiguration) -> None: - sql_client = Psycopg2SqlClient(config.normalize_dataset_name(schema), config.credentials) + def __init__( + self, + schema: Schema, + config: PostgresClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + sql_client = Psycopg2SqlClient( + config.normalize_dataset_name(schema), config.credentials, capabilities + ) super().__init__(schema, config, sql_client) self.config: 
PostgresClientConfiguration = config self.sql_client: Psycopg2SqlClient = sql_client @@ -148,7 +222,7 @@ def __init__(self, schema: Schema, config: PostgresClientConfiguration) -> None: def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: job = super().start_file_load(table, file_path, load_id) if not job and file_path.endswith("csv"): - job = PostgresCsvCopyJob(table["name"], file_path, self.sql_client) + job = PostgresCsvCopyJob(table, file_path, self) return job def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: @@ -157,7 +231,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non for h in self.active_hints.keys() if c.get(h, False) is True ) - column_name = self.capabilities.escape_identifier(c["name"]) + column_name = self.sql_client.escape_column_name(c["name"]) return ( f"{column_name} {self.type_mapper.to_db_type(c)} {hints_str} {self._gen_not_null(c.get('nullable', True))}" ) diff --git a/dlt/destinations/impl/postgres/sql_client.py b/dlt/destinations/impl/postgres/sql_client.py index 366ed243ef..38bfc212d5 100644 --- a/dlt/destinations/impl/postgres/sql_client.py +++ b/dlt/destinations/impl/postgres/sql_client.py @@ -26,15 +26,18 @@ ) from dlt.destinations.impl.postgres.configuration import PostgresCredentials -from dlt.destinations.impl.postgres import capabilities class Psycopg2SqlClient(SqlClientBase["psycopg2.connection"], DBTransaction): dbapi: ClassVar[DBApi] = psycopg2 - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - def __init__(self, dataset_name: str, credentials: PostgresCredentials) -> None: - super().__init__(credentials.database, dataset_name) + def __init__( + self, + dataset_name: str, + credentials: PostgresCredentials, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(credentials.database, dataset_name, capabilities) self._conn: psycopg2.connection = None self.credentials = credentials @@ -112,11 +115,6 @@ def execute_fragments( composed = Composed(sql if isinstance(sql, Composable) else SQL(sql) for sql in fragments) return self.execute_sql(composed, *args, **kwargs) - def fully_qualified_dataset_name(self, escape: bool = True) -> str: - return ( - self.capabilities.escape_identifier(self.dataset_name) if escape else self.dataset_name - ) - def _reset_connection(self) -> None: # self._conn.autocommit = True self._conn.reset() diff --git a/dlt/destinations/impl/qdrant/__init__.py b/dlt/destinations/impl/qdrant/__init__.py index 1a2c466b14..e69de29bb2 100644 --- a/dlt/destinations/impl/qdrant/__init__.py +++ b/dlt/destinations/impl/qdrant/__init__.py @@ -1,18 +0,0 @@ -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.destinations.impl.qdrant.qdrant_adapter import qdrant_adapter - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = "jsonl" - caps.supported_loader_file_formats = ["jsonl"] - - caps.max_identifier_length = 200 - caps.max_column_identifier_length = 1024 - caps.max_query_length = 8 * 1024 * 1024 - caps.is_max_query_length_in_bytes = False - caps.max_text_data_type_length = 8 * 1024 * 1024 - caps.is_max_text_data_type_length_in_bytes = False - caps.supports_ddl_transactions = False - - return caps diff --git a/dlt/destinations/impl/qdrant/configuration.py b/dlt/destinations/impl/qdrant/configuration.py index fd11cc7dcb..4d1ed1234d 100644 --- 
a/dlt/destinations/impl/qdrant/configuration.py +++ b/dlt/destinations/impl/qdrant/configuration.py @@ -18,6 +18,8 @@ class QdrantCredentials(CredentialsConfiguration): location: Optional[str] = None # API key for authentication in Qdrant Cloud. Default: `None` api_key: Optional[str] = None + # Persistence path for QdrantLocal. Default: `None` + path: Optional[str] = None def __str__(self) -> str: return self.location or "localhost" @@ -44,7 +46,7 @@ class QdrantClientOptions(BaseConfiguration): # Default: `None` host: Optional[str] = None # Persistence path for QdrantLocal. Default: `None` - path: Optional[str] = None + # path: Optional[str] = None @configspec diff --git a/dlt/destinations/impl/qdrant/factory.py b/dlt/destinations/impl/qdrant/factory.py index df9cd64871..defd29a03a 100644 --- a/dlt/destinations/impl/qdrant/factory.py +++ b/dlt/destinations/impl/qdrant/factory.py @@ -3,7 +3,6 @@ from dlt.common.destination import Destination, DestinationCapabilitiesContext from dlt.destinations.impl.qdrant.configuration import QdrantCredentials, QdrantClientConfiguration -from dlt.destinations.impl.qdrant import capabilities if t.TYPE_CHECKING: from dlt.destinations.impl.qdrant.qdrant_client import QdrantClient @@ -12,8 +11,20 @@ class qdrant(Destination[QdrantClientConfiguration, "QdrantClient"]): spec = QdrantClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities() + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "jsonl" + caps.supported_loader_file_formats = ["jsonl"] + caps.has_case_sensitive_identifiers = True + caps.max_identifier_length = 200 + caps.max_column_identifier_length = 1024 + caps.max_query_length = 8 * 1024 * 1024 + caps.is_max_query_length_in_bytes = False + caps.max_text_data_type_length = 8 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = False + caps.supports_ddl_transactions = False + + return caps @property def client_class(self) -> t.Type["QdrantClient"]: diff --git a/dlt/destinations/impl/qdrant/qdrant_client.py b/dlt/destinations/impl/qdrant/qdrant_client.py index 9898b28c86..51915c5536 100644 --- a/dlt/destinations/impl/qdrant/qdrant_client.py +++ b/dlt/destinations/impl/qdrant/qdrant_client.py @@ -1,19 +1,25 @@ from types import TracebackType -from typing import ClassVar, Optional, Sequence, List, Dict, Type, Iterable, Any, IO +from typing import Optional, Sequence, List, Dict, Type, Iterable, Any from dlt.common import logger from dlt.common.json import json from dlt.common.pendulum import pendulum from dlt.common.schema import Schema, TTableSchema, TSchemaTables -from dlt.common.schema.utils import get_columns_names_with_prop +from dlt.common.schema.utils import ( + get_columns_names_with_prop, + loads_table, + normalize_table_identifiers, + version_table, +) from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import TLoadJobState, LoadJob, JobClientBase, WithStateSync from dlt.common.storages import FileStorage +from dlt.common.time import precise_time from dlt.destinations.job_impl import EmptyLoadJob from dlt.destinations.job_client_impl import StorageSchemaInfo, StateInfo -from dlt.destinations.impl.qdrant import capabilities +from dlt.destinations.utils import get_pipeline_state_query_columns from dlt.destinations.impl.qdrant.configuration import QdrantClientConfiguration from dlt.destinations.impl.qdrant.qdrant_adapter import VECTORIZE_HINT @@ 
-49,21 +55,24 @@ def __init__( if self.unique_identifiers else uuid.uuid4() ) - embedding_doc = self._get_embedding_doc(data) payloads.append(data) ids.append(point_id) - docs.append(embedding_doc) - - embedding_model = db_client._get_or_init_model(db_client.embedding_model_name) - embeddings = list( - embedding_model.embed( - docs, - batch_size=self.config.embedding_batch_size, - parallel=self.config.embedding_parallelism, + if len(self.embedding_fields) > 0: + docs.append(self._get_embedding_doc(data)) + + if len(self.embedding_fields) > 0: + embedding_model = db_client._get_or_init_model(db_client.embedding_model_name) + embeddings = list( + embedding_model.embed( + docs, + batch_size=self.config.embedding_batch_size, + parallel=self.config.embedding_parallelism, + ) ) - ) - vector_name = db_client.get_vector_field_name() - embeddings = [{vector_name: embedding.tolist()} for embedding in embeddings] + vector_name = db_client.get_vector_field_name() + embeddings = [{vector_name: embedding.tolist()} for embedding in embeddings] + else: + embeddings = [{}] * len(ids) assert len(embeddings) == len(payloads) == len(ids) self._upload_data(vectors=embeddings, ids=ids, payloads=payloads) @@ -126,7 +135,7 @@ def _generate_uuid( collection_name (str): Qdrant collection name. Returns: - str: A string representation of the genrated UUID + str: A string representation of the generated UUID """ data_id = "_".join(str(data[key]) for key in unique_identifiers) return str(uuid.uuid5(uuid.NAMESPACE_DNS, collection_name + data_id)) @@ -141,20 +150,25 @@ def exception(self) -> str: class QdrantClient(JobClientBase, WithStateSync): """Qdrant Destination Handler""" - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - state_properties: ClassVar[List[str]] = [ - "version", - "engine_version", - "pipeline_name", - "state", - "created_at", - "_dlt_load_id", - ] - - def __init__(self, schema: Schema, config: QdrantClientConfiguration) -> None: - super().__init__(schema, config) + def __init__( + self, + schema: Schema, + config: QdrantClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(schema, config, capabilities) + # get definitions of the dlt tables, normalize column names and keep for later use + version_table_ = normalize_table_identifiers(version_table(), schema.naming) + self.version_collection_properties = list(version_table_["columns"].keys()) + loads_table_ = normalize_table_identifiers(loads_table(), schema.naming) + self.loads_collection_properties = list(loads_table_["columns"].keys()) + state_table_ = normalize_table_identifiers( + get_pipeline_state_query_columns(), schema.naming + ) + self.pipeline_state_properties = list(state_table_["columns"].keys()) + self.config: QdrantClientConfiguration = config - self.db_client: QC = QdrantClient._create_db_client(config) + self.db_client: QC = None self.model = config.model @property @@ -216,19 +230,24 @@ def _create_collection(self, full_collection_name: str) -> None: self.db_client.create_collection( collection_name=full_collection_name, vectors_config=vectors_config ) + # TODO: we can use index hints to create indexes on properties or full text + # self.db_client.create_payload_index(full_collection_name, "_dlt_load_id", field_type="float") - def _create_point(self, obj: Dict[str, Any], collection_name: str) -> None: + def _create_point_no_vector(self, obj: Dict[str, Any], collection_name: str) -> None: """Inserts a point into a Qdrant collection without a vector. 
Args: obj (Dict[str, Any]): The arbitrary data to be inserted as payload. collection_name (str): The name of the collection to insert the point into. """ + # we want decreased ids because the point scroll functions orders by id ASC + # so we want newest first + id_ = 2**64 - int(precise_time() * 10**6) self.db_client.upsert( collection_name, points=[ models.PointStruct( - id=str(uuid.uuid4()), + id=id_, payload=obj, vector={}, ) @@ -308,7 +327,13 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: """Loads compressed state from destination storage By finding a load id that was completed """ - limit = 10 + # normalize property names + p_load_id = self.schema.naming.normalize_identifier("load_id") + p_dlt_load_id = self.schema.naming.normalize_identifier("_dlt_load_id") + p_pipeline_name = self.schema.naming.normalize_identifier("pipeline_name") + # p_created_at = self.schema.naming.normalize_identifier("created_at") + + limit = 100 offset = None while True: try: @@ -317,22 +342,28 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: ) state_records, offset = self.db_client.scroll( scroll_table_name, - with_payload=self.state_properties, + with_payload=self.pipeline_state_properties, scroll_filter=models.Filter( must=[ models.FieldCondition( - key="pipeline_name", match=models.MatchValue(value=pipeline_name) + key=p_pipeline_name, match=models.MatchValue(value=pipeline_name) ) ] ), + # search by package load id which is guaranteed to increase over time + # order_by=models.OrderBy( + # key=p_created_at, + # # direction=models.Direction.DESC, + # ), limit=limit, offset=offset, ) + # print("state_r", state_records) if len(state_records) == 0: return None for state_record in state_records: state = state_record.payload - load_id = state["_dlt_load_id"] + load_id = state[p_dlt_load_id] scroll_table_name = self._make_qualified_collection_name( self.schema.loads_table_name ) @@ -342,13 +373,12 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: count_filter=models.Filter( must=[ models.FieldCondition( - key="load_id", match=models.MatchValue(value=load_id) + key=p_load_id, match=models.MatchValue(value=load_id) ) ] ), ) if load_records.count > 0: - state["dlt_load_id"] = state.pop("_dlt_load_id") return StateInfo(**state) except Exception: return None @@ -357,18 +387,28 @@ def get_stored_schema(self) -> Optional[StorageSchemaInfo]: """Retrieves newest schema from destination storage""" try: scroll_table_name = self._make_qualified_collection_name(self.schema.version_table_name) + p_schema_name = self.schema.naming.normalize_identifier("schema_name") + # this works only because we create points that have no vectors + # with decreasing ids. 
so newest (lowest ids) go first + # we do not use order_by because it requires and index to be created + # and this behavior is different for local and cloud qdrant + # p_inserted_at = self.schema.naming.normalize_identifier("inserted_at") response = self.db_client.scroll( scroll_table_name, with_payload=True, scroll_filter=models.Filter( must=[ models.FieldCondition( - key="schema_name", + key=p_schema_name, match=models.MatchValue(value=self.schema.name), ) ] ), limit=1, + # order_by=models.OrderBy( + # key=p_inserted_at, + # direction=models.Direction.DESC, + # ) ) record = response[0][0].payload return StorageSchemaInfo(**record) @@ -378,13 +418,14 @@ def get_stored_schema(self) -> Optional[StorageSchemaInfo]: def get_stored_schema_by_hash(self, schema_hash: str) -> Optional[StorageSchemaInfo]: try: scroll_table_name = self._make_qualified_collection_name(self.schema.version_table_name) + p_version_hash = self.schema.naming.normalize_identifier("version_hash") response = self.db_client.scroll( scroll_table_name, with_payload=True, scroll_filter=models.Filter( must=[ models.FieldCondition( - key="version_hash", match=models.MatchValue(value=schema_hash) + key=p_version_hash, match=models.MatchValue(value=schema_hash) ) ] ), @@ -408,16 +449,14 @@ def restore_file_load(self, file_path: str) -> LoadJob: return EmptyLoadJob.from_file_path(file_path, "completed") def complete_load(self, load_id: str) -> None: - properties = { - "load_id": load_id, - "schema_name": self.schema.name, - "status": 0, - "inserted_at": str(pendulum.now()), - } + values = [load_id, self.schema.name, 0, str(pendulum.now()), self.schema.version_hash] + assert len(values) == len(self.loads_collection_properties) + properties = {k: v for k, v in zip(self.loads_collection_properties, values)} loads_table_name = self._make_qualified_collection_name(self.schema.loads_table_name) - self._create_point(properties, loads_table_name) + self._create_point_no_vector(properties, loads_table_name) def __enter__(self) -> "QdrantClient": + self.db_client = QdrantClient._create_db_client(self.config) return self def __exit__( @@ -426,20 +465,24 @@ def __exit__( exc_val: BaseException, exc_tb: TracebackType, ) -> None: - pass + if self.db_client: + self.db_client.close() + self.db_client = None def _update_schema_in_storage(self, schema: Schema) -> None: schema_str = json.dumps(schema.to_dict()) - properties = { - "version_hash": schema.stored_version_hash, - "schema_name": schema.name, - "version": schema.version, - "engine_version": schema.ENGINE_VERSION, - "inserted_at": str(pendulum.now()), - "schema": schema_str, - } + values = [ + schema.version, + schema.ENGINE_VERSION, + str(pendulum.now().isoformat()), + schema.name, + schema.stored_version_hash, + schema_str, + ] + assert len(values) == len(self.version_collection_properties) + properties = {k: v for k, v in zip(self.version_collection_properties, values)} version_table_name = self._make_qualified_collection_name(self.schema.version_table_name) - self._create_point(properties, version_table_name) + self._create_point_no_vector(properties, version_table_name) def _execute_schema_update(self, only_tables: Iterable[str]) -> None: for table_name in only_tables or self.schema.tables: @@ -460,6 +503,10 @@ def _collection_exists(self, table_name: str, qualify_table_name: bool = True) - ) self.db_client.get_collection(table_name) return True + except ValueError as e: + if "not found" in str(e): + return False + raise e except UnexpectedResponse as e: if e.status_code == 404: return 
False diff --git a/dlt/destinations/impl/redshift/__init__.py b/dlt/destinations/impl/redshift/__init__.py index 8a8cae84b4..e69de29bb2 100644 --- a/dlt/destinations/impl/redshift/__init__.py +++ b/dlt/destinations/impl/redshift/__init__.py @@ -1,25 +0,0 @@ -from dlt.common.data_writers.escape import escape_redshift_identifier, escape_redshift_literal -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = "insert_values" - caps.supported_loader_file_formats = ["insert_values"] - caps.preferred_staging_file_format = "jsonl" - caps.supported_staging_file_formats = ["jsonl", "parquet"] - caps.escape_identifier = escape_redshift_identifier - caps.escape_literal = escape_redshift_literal - caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) - caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) - caps.max_identifier_length = 127 - caps.max_column_identifier_length = 127 - caps.max_query_length = 16 * 1024 * 1024 - caps.is_max_query_length_in_bytes = True - caps.max_text_data_type_length = 65535 - caps.is_max_text_data_type_length_in_bytes = True - caps.supports_ddl_transactions = True - caps.alter_add_multi_column = False - - return caps diff --git a/dlt/destinations/impl/redshift/configuration.py b/dlt/destinations/impl/redshift/configuration.py index 72d7f70a9f..3b84c8663e 100644 --- a/dlt/destinations/impl/redshift/configuration.py +++ b/dlt/destinations/impl/redshift/configuration.py @@ -23,7 +23,9 @@ class RedshiftCredentials(PostgresCredentials): class RedshiftClientConfiguration(PostgresClientConfiguration): destination_type: Final[str] = dataclasses.field(default="redshift", init=False, repr=False, compare=False) # type: ignore credentials: RedshiftCredentials = None + staging_iam_role: Optional[str] = None + has_case_sensitive_identifiers: bool = False def fingerprint(self) -> str: """Returns a fingerprint of host part of a connection string""" diff --git a/dlt/destinations/impl/redshift/factory.py b/dlt/destinations/impl/redshift/factory.py index d80ef9dcad..7e6638be1e 100644 --- a/dlt/destinations/impl/redshift/factory.py +++ b/dlt/destinations/impl/redshift/factory.py @@ -1,12 +1,14 @@ import typing as t from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.data_writers.escape import escape_redshift_identifier, escape_redshift_literal +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE +from dlt.common.normalizers.naming import NamingConvention from dlt.destinations.impl.redshift.configuration import ( RedshiftCredentials, RedshiftClientConfiguration, ) -from dlt.destinations.impl.redshift import capabilities if t.TYPE_CHECKING: from dlt.destinations.impl.redshift.redshift import RedshiftClient @@ -15,8 +17,31 @@ class redshift(Destination[RedshiftClientConfiguration, "RedshiftClient"]): spec = RedshiftClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities() + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "insert_values" + caps.supported_loader_file_formats = ["insert_values"] + caps.preferred_staging_file_format = "jsonl" + caps.supported_staging_file_formats = ["jsonl", "parquet"] + # redshift is case 
insensitive and will lower case identifiers when stored + # you can enable case sensitivity https://docs.aws.amazon.com/redshift/latest/dg/r_enable_case_sensitive_identifier.html + # then redshift behaves like postgres + caps.escape_identifier = escape_redshift_identifier + caps.escape_literal = escape_redshift_literal + caps.casefold_identifier = str.lower + caps.has_case_sensitive_identifiers = False + caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) + caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) + caps.max_identifier_length = 127 + caps.max_column_identifier_length = 127 + caps.max_query_length = 16 * 1024 * 1024 + caps.is_max_query_length_in_bytes = True + caps.max_text_data_type_length = 65535 + caps.is_max_text_data_type_length_in_bytes = True + caps.supports_ddl_transactions = True + caps.alter_add_multi_column = False + + return caps @property def client_class(self) -> t.Type["RedshiftClient"]: @@ -27,8 +52,8 @@ def client_class(self) -> t.Type["RedshiftClient"]: def __init__( self, credentials: t.Union[RedshiftCredentials, t.Dict[str, t.Any], str] = None, - create_indexes: bool = True, staging_iam_role: t.Optional[str] = None, + has_case_sensitive_identifiers: bool = False, destination_name: t.Optional[str] = None, environment: t.Optional[str] = None, **kwargs: t.Any, @@ -40,15 +65,28 @@ def __init__( Args: credentials: Credentials to connect to the redshift database. Can be an instance of `RedshiftCredentials` or a connection string in the format `redshift://user:password@host:port/database` - create_indexes: Should unique indexes be created staging_iam_role: IAM role to use for staging data in S3 + has_case_sensitive_identifiers: Are case sensitive identifiers enabled for a database **kwargs: Additional arguments passed to the destination config """ super().__init__( credentials=credentials, - create_indexes=create_indexes, staging_iam_role=staging_iam_role, + has_case_sensitive_identifiers=has_case_sensitive_identifiers, destination_name=destination_name, environment=environment, **kwargs, ) + + @classmethod + def adjust_capabilities( + cls, + caps: DestinationCapabilitiesContext, + config: RedshiftClientConfiguration, + naming: t.Optional[NamingConvention], + ) -> DestinationCapabilitiesContext: + # modify the caps if case sensitive identifiers are requested + if config.has_case_sensitive_identifiers: + caps.has_case_sensitive_identifiers = True + caps.casefold_identifier = str + return super().adjust_capabilities(caps, config, naming) diff --git a/dlt/destinations/impl/redshift/redshift.py b/dlt/destinations/impl/redshift/redshift.py index 672fceb7b2..faa037078a 100644 --- a/dlt/destinations/impl/redshift/redshift.py +++ b/dlt/destinations/impl/redshift/redshift.py @@ -1,11 +1,6 @@ import platform import os -from dlt.common.exceptions import TerminalValueError -from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient - -from dlt.common.schema.utils import table_schema_has_type, table_schema_has_type_with_precision - if platform.python_implementation() == "PyPy": import psycopg2cffi as psycopg2 @@ -15,25 +10,27 @@ # from psycopg2.sql import SQL, Composed -from typing import ClassVar, Dict, List, Optional, Sequence, Any +from typing import Dict, List, Optional, Sequence, Any, Tuple + -from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( NewLoadJob, CredentialsConfiguration, SupportsStagingDestination, ) from dlt.common.data_types import TDataType +from 
dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.schema import TColumnSchema, TColumnHint, Schema -from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat +from dlt.common.exceptions import TerminalValueError +from dlt.common.schema.utils import table_schema_has_type, table_schema_has_type_with_precision +from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat, TTableSchemaColumns from dlt.common.configuration.specs import AwsCredentialsWithoutDefaults from dlt.destinations.insert_job_client import InsertValuesJobClient from dlt.destinations.sql_jobs import SqlMergeJob from dlt.destinations.exceptions import DatabaseTerminalException, LoadJobTerminalException from dlt.destinations.job_client_impl import CopyRemoteFileLoadJob, LoadJob - -from dlt.destinations.impl.redshift import capabilities +from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient from dlt.destinations.impl.redshift.configuration import RedshiftClientConfiguration from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations.sql_client import SqlClientBase @@ -109,8 +106,6 @@ def from_db_type( class RedshiftSqlClient(Psycopg2SqlClient): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - @staticmethod def _maybe_make_terminal_exception_from_data_error( pg_ex: psycopg2.DataError, @@ -151,7 +146,6 @@ def execute(self, table: TTableSchema, bucket_path: str) -> None: "CREDENTIALS" f" 'aws_access_key_id={aws_access_key};aws_secret_access_key={aws_secret_key}'" ) - table_name = table["name"] # get format ext = os.path.splitext(bucket_path)[1][1:] @@ -191,10 +185,9 @@ def execute(self, table: TTableSchema, bucket_path: str) -> None: raise ValueError(f"Unsupported file type {ext} for Redshift.") with self._sql_client.begin_transaction(): - dataset_name = self._sql_client.dataset_name # TODO: if we ever support csv here remember to add column names to COPY self._sql_client.execute_sql(f""" - COPY {dataset_name}.{table_name} + COPY {self._sql_client.make_qualified_table_name(table['name'])} FROM '{bucket_path}' {file_type} {dateformat} @@ -231,10 +224,15 @@ def gen_key_table_clauses( class RedshiftClient(InsertValuesJobClient, SupportsStagingDestination): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - - def __init__(self, schema: Schema, config: RedshiftClientConfiguration) -> None: - sql_client = RedshiftSqlClient(config.normalize_dataset_name(schema), config.credentials) + def __init__( + self, + schema: Schema, + config: RedshiftClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + sql_client = RedshiftSqlClient( + config.normalize_dataset_name(schema), config.credentials, capabilities + ) super().__init__(schema, config, sql_client) self.sql_client = sql_client self.config: RedshiftClientConfiguration = config @@ -249,7 +247,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non for h in HINT_TO_REDSHIFT_ATTR.keys() if c.get(h, False) is True ) - column_name = self.capabilities.escape_identifier(c["name"]) + column_name = self.sql_client.escape_column_name(c["name"]) return ( f"{column_name} {self.type_mapper.to_db_type(c)} {hints_str} {self._gen_not_null(c.get('nullable', True))}" ) diff --git a/dlt/destinations/impl/snowflake/__init__.py b/dlt/destinations/impl/snowflake/__init__.py index dde4d5a382..e69de29bb2 100644 --- a/dlt/destinations/impl/snowflake/__init__.py +++ 
b/dlt/destinations/impl/snowflake/__init__.py @@ -1,25 +0,0 @@ -from dlt.common.data_writers.escape import escape_bigquery_identifier -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.data_writers.escape import escape_snowflake_identifier -from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = "jsonl" - caps.supported_loader_file_formats = ["jsonl", "parquet"] - caps.preferred_staging_file_format = "jsonl" - caps.supported_staging_file_formats = ["jsonl", "parquet"] - caps.escape_identifier = escape_snowflake_identifier - caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) - caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) - caps.max_identifier_length = 255 - caps.max_column_identifier_length = 255 - caps.max_query_length = 2 * 1024 * 1024 - caps.is_max_query_length_in_bytes = True - caps.max_text_data_type_length = 16 * 1024 * 1024 - caps.is_max_text_data_type_length_in_bytes = True - caps.supports_ddl_transactions = True - caps.alter_add_multi_column = True - caps.supports_clone_table = True - return caps diff --git a/dlt/destinations/impl/snowflake/configuration.py b/dlt/destinations/impl/snowflake/configuration.py index 8529fbe5c8..1211b78672 100644 --- a/dlt/destinations/impl/snowflake/configuration.py +++ b/dlt/destinations/impl/snowflake/configuration.py @@ -1,8 +1,9 @@ import dataclasses import base64 -from typing import Final, Optional, Any, Dict, ClassVar, List, TYPE_CHECKING, Union +from typing import Final, Optional, Any, Dict, ClassVar, List from dlt import version +from dlt.common.data_writers.configuration import CsvFormatConfiguration from dlt.common.libs.sql_alchemy import URL from dlt.common.exceptions import MissingDependencyException from dlt.common.typing import TSecretStrValue @@ -135,6 +136,9 @@ class SnowflakeClientConfiguration(DestinationClientDwhWithStagingConfiguration) keep_staged_files: bool = True """Whether to keep or delete the staged files after COPY INTO succeeds""" + csv_format: Optional[CsvFormatConfiguration] = None + """Optional csv format configuration""" + def fingerprint(self) -> str: """Returns a fingerprint of host part of a connection string""" if self.credentials and self.credentials.host: diff --git a/dlt/destinations/impl/snowflake/factory.py b/dlt/destinations/impl/snowflake/factory.py index c4459232b7..f531b8704e 100644 --- a/dlt/destinations/impl/snowflake/factory.py +++ b/dlt/destinations/impl/snowflake/factory.py @@ -1,11 +1,14 @@ import typing as t +from dlt.common.data_writers.configuration import CsvFormatConfiguration +from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.data_writers.escape import escape_snowflake_identifier +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE + from dlt.destinations.impl.snowflake.configuration import ( SnowflakeCredentials, SnowflakeClientConfiguration, ) -from dlt.destinations.impl.snowflake import capabilities -from dlt.common.destination import Destination, DestinationCapabilitiesContext if t.TYPE_CHECKING: from dlt.destinations.impl.snowflake.snowflake import SnowflakeClient @@ -14,8 +17,31 @@ class snowflake(Destination[SnowflakeClientConfiguration, "SnowflakeClient"]): spec = SnowflakeClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities() + def 
_raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "jsonl" + caps.supported_loader_file_formats = ["jsonl", "parquet", "csv"] + caps.preferred_staging_file_format = "jsonl" + caps.supported_staging_file_formats = ["jsonl", "parquet", "csv"] + # snowflake is case sensitive but all unquoted identifiers are upper cased + # so upper case identifiers are considered case insensitive + caps.escape_identifier = escape_snowflake_identifier + # dlt is configured to create case insensitive identifiers + # note that case sensitive naming conventions will change this setting to "str" (case sensitive) + caps.casefold_identifier = str.upper + caps.has_case_sensitive_identifiers = True + caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) + caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) + caps.max_identifier_length = 255 + caps.max_column_identifier_length = 255 + caps.max_query_length = 2 * 1024 * 1024 + caps.is_max_query_length_in_bytes = True + caps.max_text_data_type_length = 16 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = True + caps.supports_ddl_transactions = True + caps.alter_add_multi_column = True + caps.supports_clone_table = True + return caps @property def client_class(self) -> t.Type["SnowflakeClient"]: @@ -28,6 +54,7 @@ def __init__( credentials: t.Union[SnowflakeCredentials, t.Dict[str, t.Any], str] = None, stage_name: t.Optional[str] = None, keep_staged_files: bool = True, + csv_format: t.Optional[CsvFormatConfiguration] = None, destination_name: t.Optional[str] = None, environment: t.Optional[str] = None, **kwargs: t.Any, @@ -46,6 +73,7 @@ def __init__( credentials=credentials, stage_name=stage_name, keep_staged_files=keep_staged_files, + csv_format=csv_format, destination_name=destination_name, environment=environment, **kwargs, diff --git a/dlt/destinations/impl/snowflake/snowflake.py b/dlt/destinations/impl/snowflake/snowflake.py index 70377de709..2a5671b7e7 100644 --- a/dlt/destinations/impl/snowflake/snowflake.py +++ b/dlt/destinations/impl/snowflake/snowflake.py @@ -1,6 +1,7 @@ -from typing import ClassVar, Optional, Sequence, Tuple, List, Any +from typing import Optional, Sequence, List from urllib.parse import urlparse, urlunparse +from dlt.common.data_writers.configuration import CsvFormatConfiguration from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( FollowupJob, @@ -14,7 +15,6 @@ AwsCredentialsWithoutDefaults, AzureCredentialsWithoutDefaults, ) -from dlt.common.data_types import TDataType from dlt.common.storages.file_storage import FileStorage from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat @@ -24,13 +24,10 @@ from dlt.destinations.job_impl import EmptyLoadJob from dlt.destinations.exceptions import LoadJobTerminalException -from dlt.destinations.impl.snowflake import capabilities from dlt.destinations.impl.snowflake.configuration import SnowflakeClientConfiguration from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient -from dlt.destinations.sql_jobs import SqlJobParams from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient from dlt.destinations.job_impl import NewReferenceJob -from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.type_mapping import TypeMapper @@ -86,6 +83,7 @@ def __init__( table_name: str, load_id: str, 
client: SnowflakeSqlClient, + config: SnowflakeClientConfiguration, stage_name: Optional[str] = None, keep_staged_files: bool = True, staging_credentials: Optional[CredentialsConfiguration] = None, @@ -108,6 +106,14 @@ def __init__( credentials_clause = "" files_clause = "" stage_file_path = "" + on_error_clause = "" + + case_folding = ( + "CASE_SENSITIVE" + if client.capabilities.casefold_identifier is str + else "CASE_INSENSITIVE" + ) + column_match_clause = f"MATCH_BY_COLUMN_NAME='{case_folding}'" if bucket_path: bucket_url = urlparse(bucket_path) @@ -164,9 +170,28 @@ def __init__( from_clause = f"FROM {stage_file_path}" # decide on source format, stage_file_path will either be a local file or a bucket path - source_format = "( TYPE = 'JSON', BINARY_FORMAT = 'BASE64' )" + if file_name.endswith("jsonl"): + source_format = "( TYPE = 'JSON', BINARY_FORMAT = 'BASE64' )" if file_name.endswith("parquet"): - source_format = "(TYPE = 'PARQUET', BINARY_AS_TEXT = FALSE, USE_LOGICAL_TYPE = TRUE)" + source_format = ( + "(TYPE = 'PARQUET', BINARY_AS_TEXT = FALSE, USE_LOGICAL_TYPE = TRUE)" + # TODO: USE_VECTORIZED_SCANNER inserts null strings into VARIANT JSON + # " USE_VECTORIZED_SCANNER = TRUE)" + ) + if file_name.endswith("csv"): + # empty strings are NULL, no data is NULL, missing columns (ERROR_ON_COLUMN_COUNT_MISMATCH) are NULL + csv_format = config.csv_format or CsvFormatConfiguration() + source_format = ( + "(TYPE = 'CSV', BINARY_FORMAT = 'UTF-8', PARSE_HEADER =" + f" {csv_format.include_header}, FIELD_OPTIONALLY_ENCLOSED_BY = '\"', NULL_IF =" + " (''), ERROR_ON_COLUMN_COUNT_MISMATCH = FALSE," + f" FIELD_DELIMITER='{csv_format.delimiter}', ENCODING='{csv_format.encoding}')" + ) + # disable column match if headers are not provided + if not csv_format.include_header: + column_match_clause = "" + if csv_format.on_error_continue: + on_error_clause = "ON_ERROR = CONTINUE" with client.begin_transaction(): # PUT and COPY in one tx if local file, otherwise only copy @@ -180,7 +205,8 @@ def __init__( {files_clause} {credentials_clause} FILE_FORMAT = {source_format} - MATCH_BY_COLUMN_NAME='CASE_INSENSITIVE' + {column_match_clause} + {on_error_clause} """) if stage_file_path and not keep_staged_files: client.execute_sql(f"REMOVE {stage_file_path}") @@ -193,10 +219,15 @@ def exception(self) -> str: class SnowflakeClient(SqlJobClientWithStaging, SupportsStagingDestination): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - - def __init__(self, schema: Schema, config: SnowflakeClientConfiguration) -> None: - sql_client = SnowflakeSqlClient(config.normalize_dataset_name(schema), config.credentials) + def __init__( + self, + schema: Schema, + config: SnowflakeClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + sql_client = SnowflakeSqlClient( + config.normalize_dataset_name(schema), config.credentials, capabilities + ) super().__init__(schema, config, sql_client) self.config: SnowflakeClientConfiguration = config self.sql_client: SnowflakeSqlClient = sql_client # type: ignore @@ -211,6 +242,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> table["name"], load_id, self.sql_client, + self.config, stage_name=self.config.stage_name, keep_staged_files=self.config.keep_staged_files, staging_credentials=( @@ -241,7 +273,7 @@ def _get_table_update_sql( sql = super()._get_table_update_sql(table_name, new_columns, generate_alter) cluster_list = [ - self.capabilities.escape_identifier(c["name"]) for c in new_columns if 
c.get("cluster") + self.sql_client.escape_column_name(c["name"]) for c in new_columns if c.get("cluster") ] if cluster_list: @@ -255,17 +287,7 @@ def _from_db_type( return self.type_mapper.from_db_type(bq_t, precision, scale) def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: - name = self.capabilities.escape_identifier(c["name"]) + name = self.sql_client.escape_column_name(c["name"]) return ( f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}" ) - - def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: - table_name = table_name.upper() # All snowflake tables are uppercased in information schema - exists, table = super().get_storage_table(table_name) - if not exists: - return exists, table - # Snowflake converts all unquoted columns to UPPER CASE - # Convert back to lower case to enable comparison with dlt schema - table = {col_name.lower(): dict(col, name=col_name.lower()) for col_name, col in table.items()} # type: ignore - return exists, table diff --git a/dlt/destinations/impl/snowflake/sql_client.py b/dlt/destinations/impl/snowflake/sql_client.py index 4a602ce0e8..e033a9f455 100644 --- a/dlt/destinations/impl/snowflake/sql_client.py +++ b/dlt/destinations/impl/snowflake/sql_client.py @@ -17,7 +17,6 @@ ) from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction, DataFrame from dlt.destinations.impl.snowflake.configuration import SnowflakeCredentials -from dlt.destinations.impl.snowflake import capabilities class SnowflakeCursorImpl(DBApiCursorImpl): @@ -31,10 +30,14 @@ def df(self, chunk_size: int = None, **kwargs: Any) -> Optional[DataFrame]: class SnowflakeSqlClient(SqlClientBase[snowflake_lib.SnowflakeConnection], DBTransaction): dbapi: ClassVar[DBApi] = snowflake_lib - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - def __init__(self, dataset_name: str, credentials: SnowflakeCredentials) -> None: - super().__init__(credentials.database, dataset_name) + def __init__( + self, + dataset_name: str, + credentials: SnowflakeCredentials, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(credentials.database, dataset_name, capabilities) self._conn: snowflake_lib.SnowflakeConnection = None self.credentials = credentials @@ -112,12 +115,6 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB self.open_connection() raise outer - def fully_qualified_dataset_name(self, escape: bool = True) -> str: - # Always escape for uppercase - if escape: - return self.capabilities.escape_identifier(self.dataset_name) - return self.dataset_name.upper() - def _reset_connection(self) -> None: self._conn.rollback() self._conn.autocommit(True) diff --git a/dlt/destinations/impl/synapse/__init__.py b/dlt/destinations/impl/synapse/__init__.py index f6ad7369c1..e69de29bb2 100644 --- a/dlt/destinations/impl/synapse/__init__.py +++ b/dlt/destinations/impl/synapse/__init__.py @@ -1,54 +0,0 @@ -from dlt.common.data_writers.escape import escape_postgres_identifier, escape_mssql_literal -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.common.wei import EVM_DECIMAL_PRECISION - -from dlt.destinations.impl.synapse.synapse_adapter import synapse_adapter - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - - caps.preferred_loader_file_format = "insert_values" - 
caps.supported_loader_file_formats = ["insert_values"] - caps.preferred_staging_file_format = "parquet" - caps.supported_staging_file_formats = ["parquet"] - - caps.insert_values_writer_type = "select_union" # https://stackoverflow.com/a/77014299 - - caps.escape_identifier = escape_postgres_identifier - caps.escape_literal = escape_mssql_literal - - # Synapse has a max precision of 38 - # https://learn.microsoft.com/en-us/sql/t-sql/statements/create-table-azure-sql-data-warehouse?view=aps-pdw-2016-au7#DataTypes - caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) - caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) - - # https://learn.microsoft.com/en-us/sql/t-sql/statements/create-table-azure-sql-data-warehouse?view=aps-pdw-2016-au7#LimitationsRestrictions - caps.max_identifier_length = 128 - caps.max_column_identifier_length = 128 - - # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-service-capacity-limits#queries - caps.max_query_length = 65536 * 4096 - caps.is_max_query_length_in_bytes = True - - # nvarchar(max) can store 2 GB - # https://learn.microsoft.com/en-us/sql/t-sql/data-types/nchar-and-nvarchar-transact-sql?view=sql-server-ver16#nvarchar---n--max-- - caps.max_text_data_type_length = 2 * 1024 * 1024 * 1024 - caps.is_max_text_data_type_length_in_bytes = True - - # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-develop-transactions - caps.supports_transactions = True - caps.supports_ddl_transactions = False - - # Synapse throws "Some part of your SQL statement is nested too deeply. Rewrite the query or break it up into smaller queries." - # if number of records exceeds a certain number. Which exact number that is seems not deterministic: - # in tests, I've seen a query with 12230 records run succesfully on one run, but fail on a subsequent run, while the query remained exactly the same. - # 10.000 records is a "safe" amount that always seems to work. 
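# Illustrative sketch (not part of this patch): how the options introduced by the
# factory changes in this series could be passed from user code. Assumes the factories
# are re-exported under ``dlt.destinations`` and that ``CsvFormatConfiguration`` accepts
# the fields the copy jobs read above (delimiter, include_header, encoding,
# on_error_continue) as keyword arguments; the pipeline name below is a hypothetical
# placeholder.
import dlt
from dlt.common.data_writers.configuration import CsvFormatConfiguration

# postgres and snowflake accept a csv_format that their copy jobs translate into
# COPY options (DELIMITER/ENCODING on postgres, FIELD_DELIMITER/PARSE_HEADER/ON_ERROR
# on snowflake)
csv_format = CsvFormatConfiguration(
    delimiter="|",
    include_header=True,
    encoding="utf-8",
    on_error_continue=False,  # only snowflake maps this to ON_ERROR = CONTINUE
)
pg = dlt.destinations.postgres(csv_format=csv_format)

# redshift and synapse expose has_case_sensitive_identifiers; adjust_capabilities()
# then keeps identifiers as-is (casefold_identifier = str) instead of lower casing them
rs = dlt.destinations.redshift(has_case_sensitive_identifiers=True)

pipeline = dlt.pipeline("example_pipeline", destination=pg)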
- caps.max_rows_per_insert = 10000 - - # datetimeoffset can store 7 digits for fractional seconds - # https://learn.microsoft.com/en-us/sql/t-sql/data-types/datetimeoffset-transact-sql?view=sql-server-ver16 - caps.timestamp_precision = 7 - - return caps diff --git a/dlt/destinations/impl/synapse/factory.py b/dlt/destinations/impl/synapse/factory.py index 100878ae05..4820056e66 100644 --- a/dlt/destinations/impl/synapse/factory.py +++ b/dlt/destinations/impl/synapse/factory.py @@ -1,8 +1,10 @@ import typing as t from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.normalizers.naming import NamingConvention +from dlt.common.data_writers.escape import escape_postgres_identifier, escape_mssql_literal +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.destinations.impl.synapse import capabilities from dlt.destinations.impl.synapse.configuration import ( SynapseCredentials, SynapseClientConfiguration, @@ -21,8 +23,57 @@ class synapse(Destination[SynapseClientConfiguration, "SynapseClient"]): # def spec(self) -> t.Type[SynapseClientConfiguration]: # return SynapseClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities() + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + + caps.preferred_loader_file_format = "insert_values" + caps.supported_loader_file_formats = ["insert_values"] + caps.preferred_staging_file_format = "parquet" + caps.supported_staging_file_formats = ["parquet"] + + caps.insert_values_writer_type = "select_union" # https://stackoverflow.com/a/77014299 + + # similarly to mssql case sensitivity depends on database collation + # https://learn.microsoft.com/en-us/sql/relational-databases/collations/collation-and-unicode-support?view=sql-server-ver16#collations-in-azure-sql-database + # note that special option CATALOG_COLLATION is used to change it + caps.escape_identifier = escape_postgres_identifier + caps.escape_literal = escape_mssql_literal + # we allow to reconfigure capabilities in the mssql factory + caps.has_case_sensitive_identifiers = False + + # Synapse has a max precision of 38 + # https://learn.microsoft.com/en-us/sql/t-sql/statements/create-table-azure-sql-data-warehouse?view=aps-pdw-2016-au7#DataTypes + caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) + caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) + + # https://learn.microsoft.com/en-us/sql/t-sql/statements/create-table-azure-sql-data-warehouse?view=aps-pdw-2016-au7#LimitationsRestrictions + caps.max_identifier_length = 128 + caps.max_column_identifier_length = 128 + + # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-service-capacity-limits#queries + caps.max_query_length = 65536 * 4096 + caps.is_max_query_length_in_bytes = True + + # nvarchar(max) can store 2 GB + # https://learn.microsoft.com/en-us/sql/t-sql/data-types/nchar-and-nvarchar-transact-sql?view=sql-server-ver16#nvarchar---n--max-- + caps.max_text_data_type_length = 2 * 1024 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = True + + # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-develop-transactions + caps.supports_transactions = True + caps.supports_ddl_transactions = False + + # Synapse throws "Some part of your SQL statement is nested too deeply. Rewrite the query or break it up into smaller queries." 
+ # if number of records exceeds a certain number. Which exact number that is seems not deterministic: + # in tests, I've seen a query with 12230 records run succesfully on one run, but fail on a subsequent run, while the query remained exactly the same. + # 10.000 records is a "safe" amount that always seems to work. + caps.max_rows_per_insert = 10000 + + # datetimeoffset can store 7 digits for fractional seconds + # https://learn.microsoft.com/en-us/sql/t-sql/data-types/datetimeoffset-transact-sql?view=sql-server-ver16 + caps.timestamp_precision = 7 + + return caps @property def client_class(self) -> t.Type["SynapseClient"]: @@ -36,6 +87,7 @@ def __init__( default_table_index_type: t.Optional[TTableIndexType] = "heap", create_indexes: bool = False, staging_use_msi: bool = False, + has_case_sensitive_identifiers: bool = False, destination_name: t.Optional[str] = None, environment: t.Optional[str] = None, **kwargs: t.Any, @@ -50,6 +102,7 @@ def __init__( default_table_index_type: Maps directly to the default_table_index_type attribute of the SynapseClientConfiguration object. create_indexes: Maps directly to the create_indexes attribute of the SynapseClientConfiguration object. staging_use_msi: Maps directly to the staging_use_msi attribute of the SynapseClientConfiguration object. + has_case_sensitive_identifiers: Are identifiers used by synapse database case sensitive (following the catalog collation) **kwargs: Additional arguments passed to the destination config """ super().__init__( @@ -57,7 +110,21 @@ def __init__( default_table_index_type=default_table_index_type, create_indexes=create_indexes, staging_use_msi=staging_use_msi, + has_case_sensitive_identifiers=has_case_sensitive_identifiers, destination_name=destination_name, environment=environment, **kwargs, ) + + @classmethod + def adjust_capabilities( + cls, + caps: DestinationCapabilitiesContext, + config: SynapseClientConfiguration, + naming: t.Optional[NamingConvention], + ) -> DestinationCapabilitiesContext: + # modify the caps if case sensitive identifiers are requested + if config.has_case_sensitive_identifiers: + caps.has_case_sensitive_identifiers = True + caps.casefold_identifier = str + return super().adjust_capabilities(caps, config, naming) diff --git a/dlt/destinations/impl/synapse/sql_client.py b/dlt/destinations/impl/synapse/sql_client.py index 089c58e57c..db1b3e7cf6 100644 --- a/dlt/destinations/impl/synapse/sql_client.py +++ b/dlt/destinations/impl/synapse/sql_client.py @@ -5,15 +5,12 @@ from dlt.destinations.impl.mssql.sql_client import PyOdbcMsSqlClient from dlt.destinations.impl.mssql.configuration import MsSqlCredentials -from dlt.destinations.impl.synapse import capabilities from dlt.destinations.impl.synapse.configuration import SynapseCredentials from dlt.destinations.exceptions import DatabaseUndefinedRelation class SynapseSqlClient(PyOdbcMsSqlClient): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - def drop_tables(self, *tables: str) -> None: if not tables: return diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index 48171ace4c..de2f9d4472 100644 --- a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -1,5 +1,5 @@ import os -from typing import ClassVar, Sequence, List, Dict, Any, Optional, cast, Union +from typing import Sequence, List, Dict, Any, Optional, cast, Union from copy import deepcopy from textwrap import dedent from urllib.parse import urlparse, urlunparse @@ -29,12 +29,11 @@ 
from dlt.destinations.impl.mssql.mssql import ( MsSqlTypeMapper, - MsSqlClient, + MsSqlJobClient, VARCHAR_MAX_N, VARBINARY_MAX_N, ) -from dlt.destinations.impl.synapse import capabilities from dlt.destinations.impl.synapse.sql_client import SynapseSqlClient from dlt.destinations.impl.synapse.configuration import SynapseClientConfiguration from dlt.destinations.impl.synapse.synapse_adapter import ( @@ -53,14 +52,17 @@ } -class SynapseClient(MsSqlClient, SupportsStagingDestination): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - - def __init__(self, schema: Schema, config: SynapseClientConfiguration) -> None: - super().__init__(schema, config) +class SynapseClient(MsSqlJobClient, SupportsStagingDestination): + def __init__( + self, + schema: Schema, + config: SynapseClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(schema, config, capabilities) self.config: SynapseClientConfiguration = config self.sql_client = SynapseSqlClient( - config.normalize_dataset_name(schema), config.credentials + config.normalize_dataset_name(schema), config.credentials, capabilities ) self.active_hints = deepcopy(HINT_TO_SYNAPSE_ATTR) diff --git a/dlt/destinations/impl/weaviate/__init__.py b/dlt/destinations/impl/weaviate/__init__.py index 143e0260d2..e69de29bb2 100644 --- a/dlt/destinations/impl/weaviate/__init__.py +++ b/dlt/destinations/impl/weaviate/__init__.py @@ -1,19 +0,0 @@ -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.destinations.impl.weaviate.weaviate_adapter import weaviate_adapter - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = "jsonl" - caps.supported_loader_file_formats = ["jsonl"] - - caps.max_identifier_length = 200 - caps.max_column_identifier_length = 1024 - caps.max_query_length = 8 * 1024 * 1024 - caps.is_max_query_length_in_bytes = False - caps.max_text_data_type_length = 8 * 1024 * 1024 - caps.is_max_text_data_type_length_in_bytes = False - caps.supports_ddl_transactions = False - caps.naming_convention = "dlt.destinations.impl.weaviate.naming" - - return caps diff --git a/dlt/destinations/impl/weaviate/ci_naming.py b/dlt/destinations/impl/weaviate/ci_naming.py index cc8936f42d..63c94776ad 100644 --- a/dlt/destinations/impl/weaviate/ci_naming.py +++ b/dlt/destinations/impl/weaviate/ci_naming.py @@ -2,6 +2,11 @@ class NamingConvention(WeaviateNamingConvention): + def __init__(self, max_length: int = None) -> None: + """Case insensitive naming convention for Weaviate. Lower cases all identifiers""" + super().__init__(max_length) + self.is_case_sensitive = False + def _lowercase_property(self, identifier: str) -> str: """Lowercase the whole property to become case insensitive""" return identifier.lower() diff --git a/dlt/destinations/impl/weaviate/exceptions.py b/dlt/destinations/impl/weaviate/exceptions.py index ee798e4e76..11e440a811 100644 --- a/dlt/destinations/impl/weaviate/exceptions.py +++ b/dlt/destinations/impl/weaviate/exceptions.py @@ -1,16 +1,16 @@ from dlt.common.destination.exceptions import DestinationException, DestinationTerminalException -class WeaviateBatchError(DestinationException): +class WeaviateGrpcError(DestinationException): pass class PropertyNameConflict(DestinationTerminalException): - def __init__(self) -> None: + def __init__(self, error: str) -> None: super().__init__( "Your data contains items with identical property names when compared case insensitive." 
" Weaviate cannot handle such data. Please clean up your data before loading or change" " to case insensitive naming convention. See" " https://dlthub.com/docs/dlt-ecosystem/destinations/weaviate#names-normalization for" - " details." + f" details. [{error}]" ) diff --git a/dlt/destinations/impl/weaviate/factory.py b/dlt/destinations/impl/weaviate/factory.py index 0449e6cdd5..3d78c9582a 100644 --- a/dlt/destinations/impl/weaviate/factory.py +++ b/dlt/destinations/impl/weaviate/factory.py @@ -6,7 +6,6 @@ WeaviateCredentials, WeaviateClientConfiguration, ) -from dlt.destinations.impl.weaviate import capabilities if t.TYPE_CHECKING: from dlt.destinations.impl.weaviate.weaviate_client import WeaviateClient @@ -15,8 +14,26 @@ class weaviate(Destination[WeaviateClientConfiguration, "WeaviateClient"]): spec = WeaviateClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: - return capabilities() + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "jsonl" + caps.supported_loader_file_formats = ["jsonl"] + # weaviate names are case sensitive following GraphQL naming convention + # https://weaviate.io/developers/weaviate/config-refs/schema + caps.has_case_sensitive_identifiers = False + # weaviate will upper case first letter of class name and lower case first letter of a property + # we assume that naming convention will do that + caps.casefold_identifier = str + caps.max_identifier_length = 200 + caps.max_column_identifier_length = 1024 + caps.max_query_length = 8 * 1024 * 1024 + caps.is_max_query_length_in_bytes = False + caps.max_text_data_type_length = 8 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = False + caps.supports_ddl_transactions = False + caps.naming_convention = "dlt.destinations.impl.weaviate.naming" + + return caps @property def client_class(self) -> t.Type["WeaviateClient"]: diff --git a/dlt/destinations/impl/weaviate/naming.py b/dlt/destinations/impl/weaviate/naming.py index f5c94c872f..1e8e73a8e1 100644 --- a/dlt/destinations/impl/weaviate/naming.py +++ b/dlt/destinations/impl/weaviate/naming.py @@ -7,6 +7,10 @@ class NamingConvention(SnakeCaseNamingConvention): """Normalizes identifiers according to Weaviate documentation: https://weaviate.io/developers/weaviate/config-refs/schema#class""" + def __init__(self, max_length: int = None) -> None: + super().__init__(max_length) + self.is_case_sensitive: bool = True + RESERVED_PROPERTIES = {"id": "__id", "_id": "___id", "_additional": "__additional"} _RE_UNDERSCORES = re.compile("([^_])__+") _STARTS_DIGIT = re.compile("^[0-9]") diff --git a/dlt/destinations/impl/weaviate/weaviate_client.py b/dlt/destinations/impl/weaviate/weaviate_client.py index 2d75ca0809..71f2f13e76 100644 --- a/dlt/destinations/impl/weaviate/weaviate_client.py +++ b/dlt/destinations/impl/weaviate/weaviate_client.py @@ -31,20 +31,23 @@ from dlt.common.time import ensure_pendulum_datetime from dlt.common.schema import Schema, TTableSchema, TSchemaTables, TTableSchemaColumns from dlt.common.schema.typing import TColumnSchema, TColumnType -from dlt.common.schema.utils import get_columns_names_with_prop +from dlt.common.schema.utils import ( + get_columns_names_with_prop, + loads_table, + normalize_table_identifiers, + version_table, +) from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import TLoadJobState, LoadJob, JobClientBase, WithStateSync -from dlt.common.data_types import 
TDataType from dlt.common.storages import FileStorage from dlt.destinations.impl.weaviate.weaviate_adapter import VECTORIZE_HINT, TOKENIZATION_HINT - from dlt.destinations.job_impl import EmptyLoadJob from dlt.destinations.job_client_impl import StorageSchemaInfo, StateInfo -from dlt.destinations.impl.weaviate import capabilities from dlt.destinations.impl.weaviate.configuration import WeaviateClientConfiguration -from dlt.destinations.impl.weaviate.exceptions import PropertyNameConflict, WeaviateBatchError +from dlt.destinations.impl.weaviate.exceptions import PropertyNameConflict, WeaviateGrpcError from dlt.destinations.type_mapping import TypeMapper +from dlt.destinations.utils import get_pipeline_state_query_columns NON_VECTORIZED_CLASS = { @@ -104,7 +107,7 @@ def _wrap(self: JobClientBase, *args: Any, **kwargs: Any) -> Any: if "conflict for property" in str(status_ex) or "none vectorizer module" in str( status_ex ): - raise PropertyNameConflict() + raise PropertyNameConflict(str(status_ex)) raise DestinationTerminalException(status_ex) # looks like there are no more terminal exception raise DestinationTransientException(status_ex) @@ -115,23 +118,25 @@ def _wrap(self: JobClientBase, *args: Any, **kwargs: Any) -> Any: return _wrap # type: ignore -def wrap_batch_error(f: TFun) -> TFun: +def wrap_grpc_error(f: TFun) -> TFun: @wraps(f) def _wrap(*args: Any, **kwargs: Any) -> Any: try: return f(*args, **kwargs) # those look like terminal exceptions - except WeaviateBatchError as batch_ex: + except WeaviateGrpcError as batch_ex: errors = batch_ex.args[0] message = errors["error"][0]["message"] # TODO: actually put the job in failed/retry state and prepare exception message with full info on failing item if "invalid" in message and "property" in message and "on class" in message: raise DestinationTerminalException( - f"Batch failed {errors} AND WILL **NOT** BE RETRIED" + f"Grpc (batch, query) failed {errors} AND WILL **NOT** BE RETRIED" ) if "conflict for property" in message: - raise PropertyNameConflict() - raise DestinationTransientException(f"Batch failed {errors} AND WILL BE RETRIED") + raise PropertyNameConflict(message) + raise DestinationTransientException( + f"Grpc (batch, query) failed {errors} AND WILL BE RETRIED" + ) except Exception: raise DestinationTransientException("Batch failed AND WILL BE RETRIED") @@ -174,14 +179,14 @@ def load_batch(self, f: IO[str]) -> None: Weaviate batch supports retries so we do not need to do that. 
""" - @wrap_batch_error + @wrap_grpc_error def check_batch_result(results: List[StrAny]) -> None: """This kills batch on first error reported""" if results is not None: for result in results: if "result" in result and "errors" in result["result"]: if "error" in result["result"]["errors"]: - raise WeaviateBatchError(result["result"]["errors"]) + raise WeaviateGrpcError(result["result"]["errors"]) with self.db_client.batch( batch_size=self.client_config.batch_size, @@ -233,20 +238,25 @@ def exception(self) -> str: class WeaviateClient(JobClientBase, WithStateSync): """Weaviate client implementation.""" - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - state_properties: ClassVar[List[str]] = [ - "version", - "engine_version", - "pipeline_name", - "state", - "created_at", - "_dlt_load_id", - ] - - def __init__(self, schema: Schema, config: WeaviateClientConfiguration) -> None: - super().__init__(schema, config) + def __init__( + self, + schema: Schema, + config: WeaviateClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(schema, config, capabilities) + # get definitions of the dlt tables, normalize column names and keep for later use + version_table_ = normalize_table_identifiers(version_table(), schema.naming) + self.version_collection_properties = list(version_table_["columns"].keys()) + loads_table_ = normalize_table_identifiers(loads_table(), schema.naming) + self.loads_collection_properties = list(loads_table_["columns"].keys()) + state_table_ = normalize_table_identifiers( + get_pipeline_state_query_columns(), schema.naming + ) + self.pipeline_state_properties = list(state_table_["columns"].keys()) + self.config: WeaviateClientConfiguration = config - self.db_client = self.create_db_client(config) + self.db_client: weaviate.Client = None self._vectorizer_config = { "vectorizer": config.vectorizer, @@ -451,15 +461,23 @@ def update_stored_schema( return applied_update def _execute_schema_update(self, only_tables: Iterable[str]) -> None: - for table_name in only_tables or self.schema.tables: + for table_name in only_tables or self.schema.tables.keys(): exists, existing_columns = self.get_storage_table(table_name) # TODO: detect columns where vectorization was added or removed and modify it. 
currently we ignore change of hints - new_columns = self.schema.get_new_table_columns(table_name, existing_columns) + new_columns = self.schema.get_new_table_columns( + table_name, + existing_columns, + case_sensitive=self.capabilities.has_case_sensitive_identifiers + and self.capabilities.casefold_identifier is str, + ) logger.info(f"Found {len(new_columns)} updates for {table_name} in {self.schema.name}") if len(new_columns) > 0: if exists: + is_collection_vectorized = self._is_collection_vectorized(table_name) for column in new_columns: - prop = self._make_property_schema(column["name"], column) + prop = self._make_property_schema( + column["name"], column, is_collection_vectorized + ) self.create_class_property(table_name, prop) else: class_schema = self.make_weaviate_class_schema(table_name) @@ -487,6 +505,11 @@ def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns] def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: """Loads compressed state from destination storage""" + # normalize properties + p_load_id = self.schema.naming.normalize_identifier("load_id") + p_dlt_load_id = self.schema.naming.normalize_identifier("_dlt_load_id") + p_pipeline_name = self.schema.naming.normalize_identifier("pipeline_name") + p_status = self.schema.naming.normalize_identifier("status") # we need to find a stored state that matches a load id that was completed # we retrieve the state in blocks of 10 for this @@ -496,44 +519,45 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: state_records = self.get_records( self.schema.state_table_name, # search by package load id which is guaranteed to increase over time - sort={"path": ["_dlt_load_id"], "order": "desc"}, + sort={"path": [p_dlt_load_id], "order": "desc"}, where={ - "path": ["pipeline_name"], + "path": [p_pipeline_name], "operator": "Equal", "valueString": pipeline_name, }, limit=stepsize, offset=offset, - properties=self.state_properties, + properties=self.pipeline_state_properties, ) offset += stepsize if len(state_records) == 0: return None for state in state_records: - load_id = state["_dlt_load_id"] + load_id = state[p_dlt_load_id] load_records = self.get_records( self.schema.loads_table_name, where={ - "path": ["load_id"], + "path": [p_load_id], "operator": "Equal", "valueString": load_id, }, limit=1, - properties=["load_id", "status"], + properties=[p_load_id, p_status], ) # if there is a load for this state which was successful, return the state if len(load_records): - state["dlt_load_id"] = state.pop("_dlt_load_id") return StateInfo(**state) def get_stored_schema(self) -> Optional[StorageSchemaInfo]: """Retrieves newest schema from destination storage""" + p_schema_name = self.schema.naming.normalize_identifier("schema_name") + p_inserted_at = self.schema.naming.normalize_identifier("inserted_at") try: record = self.get_records( self.schema.version_table_name, - sort={"path": ["inserted_at"], "order": "desc"}, + sort={"path": [p_inserted_at], "order": "desc"}, where={ - "path": ["schema_name"], + "path": [p_schema_name], "operator": "Equal", "valueString": self.schema.name, }, @@ -544,11 +568,12 @@ def get_stored_schema(self) -> Optional[StorageSchemaInfo]: return None def get_stored_schema_by_hash(self, schema_hash: str) -> Optional[StorageSchemaInfo]: + p_version_hash = self.schema.naming.normalize_identifier("version_hash") try: record = self.get_records( self.schema.version_table_name, where={ - "path": ["version_hash"], + "path": [p_version_hash], "operator": "Equal", 
"valueString": schema_hash, }, @@ -585,8 +610,13 @@ def get_records( query = query.with_offset(offset) response = query.do() + # if json rpc is used, weaviate does not raise exceptions + if "errors" in response: + raise WeaviateGrpcError(response["errors"]) full_class_name = self.make_qualified_class_name(table_name) records = response["data"]["Get"][full_class_name] + if records is None: + raise DestinationTransientException(f"Could not obtain records for {full_class_name}") return cast(List[Dict[str, Any]], records) def make_weaviate_class_schema(self, table_name: str) -> Dict[str, Any]: @@ -597,31 +627,39 @@ def make_weaviate_class_schema(self, table_name: str) -> Dict[str, Any]: } # check if any column requires vectorization - if get_columns_names_with_prop(self.schema.get_table(table_name), VECTORIZE_HINT): + if self._is_collection_vectorized(table_name): class_schema.update(self._vectorizer_config) else: class_schema.update(NON_VECTORIZED_CLASS) return class_schema + def _is_collection_vectorized(self, table_name: str) -> bool: + """Tells is any of the columns has vectorize hint set""" + return ( + len(get_columns_names_with_prop(self.schema.get_table(table_name), VECTORIZE_HINT)) > 0 + ) + def _make_properties(self, table_name: str) -> List[Dict[str, Any]]: """Creates a Weaviate properties schema from a table schema. Args: table: The table name for which columns should be converted to properties """ - + is_collection_vectorized = self._is_collection_vectorized(table_name) return [ - self._make_property_schema(column_name, column) + self._make_property_schema(column_name, column, is_collection_vectorized) for column_name, column in self.schema.get_table_columns(table_name).items() ] - def _make_property_schema(self, column_name: str, column: TColumnSchema) -> Dict[str, Any]: + def _make_property_schema( + self, column_name: str, column: TColumnSchema, is_collection_vectorized: bool + ) -> Dict[str, Any]: extra_kv = {} vectorizer_name = self._vectorizer_config["vectorizer"] # x-weaviate-vectorize: (bool) means that this field should be vectorized - if not column.get(VECTORIZE_HINT, False): + if is_collection_vectorized and not column.get(VECTORIZE_HINT, False): # tell weaviate explicitly to not vectorize when column has no vectorize hint extra_kv["moduleConfig"] = { vectorizer_name: { @@ -655,15 +693,20 @@ def restore_file_load(self, file_path: str) -> LoadJob: @wrap_weaviate_error def complete_load(self, load_id: str) -> None: - properties = { - "load_id": load_id, - "schema_name": self.schema.name, - "status": 0, - "inserted_at": pendulum.now().isoformat(), - } + # corresponds to order of the columns in loads_table() + values = [ + load_id, + self.schema.name, + 0, + pendulum.now().isoformat(), + self.schema.version_hash, + ] + assert len(values) == len(self.loads_collection_properties) + properties = {k: v for k, v in zip(self.loads_collection_properties, values)} self.create_object(properties, self.schema.loads_table_name) def __enter__(self) -> "WeaviateClient": + self.db_client = self.create_db_client(self.config) return self def __exit__( @@ -672,18 +715,22 @@ def __exit__( exc_val: BaseException, exc_tb: TracebackType, ) -> None: - pass + if self.db_client: + self.db_client = None def _update_schema_in_storage(self, schema: Schema) -> None: schema_str = json.dumps(schema.to_dict()) - properties = { - "version_hash": schema.stored_version_hash, - "schema_name": schema.name, - "version": schema.version, - "engine_version": schema.ENGINE_VERSION, - "inserted_at": 
pendulum.now().isoformat(), - "schema": schema_str, - } + # corresponds to order of the columns in version_table() + values = [ + schema.version, + schema.ENGINE_VERSION, + str(pendulum.now().isoformat()), + schema.name, + schema.stored_version_hash, + schema_str, + ] + assert len(values) == len(self.version_collection_properties) + properties = {k: v for k, v in zip(self.version_collection_properties, values)} self.create_object(properties, self.schema.version_table_name) def _from_db_type( diff --git a/dlt/destinations/insert_job_client.py b/dlt/destinations/insert_job_client.py index 74e14f0221..652d13f556 100644 --- a/dlt/destinations/insert_job_client.py +++ b/dlt/destinations/insert_job_client.py @@ -36,6 +36,10 @@ def _insert(self, qualified_table_name: str, file_path: str) -> Iterator[List[st # the procedure below will split the inserts into max_query_length // 2 packs with FileStorage.open_zipsafe_ro(file_path, "r", encoding="utf-8") as f: header = f.readline() + # format and casefold header + header = self._sql_client.capabilities.casefold_identifier(header).format( + qualified_table_name + ) writer_type = self._sql_client.capabilities.insert_values_writer_type if writer_type == "default": sep = "," @@ -70,7 +74,7 @@ def _insert(self, qualified_table_name: str, file_path: str) -> Iterator[List[st # Chunk by max_rows - 1 for simplicity because one more row may be added for chunk in chunks(values_rows, max_rows - 1): processed += len(chunk) - insert_sql.append(header.format(qualified_table_name)) + insert_sql.append(header) if writer_type == "default": insert_sql.append(values_mark) if processed == len_rows: @@ -82,11 +86,9 @@ def _insert(self, qualified_table_name: str, file_path: str) -> Iterator[List[st else: # otherwise write all content in a single INSERT INTO if writer_type == "default": - insert_sql.extend( - [header.format(qualified_table_name), values_mark, content + until_nl] - ) + insert_sql.extend([header, values_mark, content + until_nl]) elif writer_type == "select_union": - insert_sql.extend([header.format(qualified_table_name), content + until_nl]) + insert_sql.extend([header, content + until_nl]) # actually this may be empty if we were able to read a full file into content if not is_eof: diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index ac3636db2b..0a627bbdfb 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -1,44 +1,39 @@ import os from abc import abstractmethod import base64 -import binascii import contextlib from copy import copy -import datetime # noqa: 251 from types import TracebackType from typing import ( Any, - ClassVar, List, - NamedTuple, Optional, Sequence, Tuple, Type, Iterable, Iterator, - ContextManager, - cast, ) import zlib import re -from dlt.common import logger +from dlt.common import pendulum, logger from dlt.common.json import json -from dlt.common.pendulum import pendulum -from dlt.common.data_types import TDataType from dlt.common.schema.typing import ( COLUMN_HINTS, TColumnType, TColumnSchemaBase, TTableSchema, - TWriteDisposition, TTableFormat, ) +from dlt.common.schema.utils import ( + loads_table, + normalize_table_identifiers, + version_table, +) from dlt.common.storages import FileStorage from dlt.common.storages.load_package import LoadJobInfo from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns, TSchemaTables -from dlt.common.schema.typing import LOADS_TABLE_NAME, VERSION_TABLE_NAME from dlt.common.destination.reference import ( 
StateInfo, StorageSchemaInfo, @@ -59,6 +54,11 @@ from dlt.destinations.sql_jobs import SqlMergeJob, SqlStagingCopyJob from dlt.destinations.typing import TNativeConn from dlt.destinations.sql_client import SqlClientBase +from dlt.destinations.utils import ( + get_pipeline_state_query_columns, + info_schema_null_to_bool, + verify_sql_job_client_schema, +) # this should suffice for now DDL_COMMANDS = ["ALTER", "CREATE", "DROP"] @@ -78,7 +78,7 @@ def __init__(self, file_path: str, sql_client: SqlClientBase[Any]) -> None: sql_client.execute_many(self._split_fragments(sql)) # if we detect ddl transactions, only execute transaction if supported by client elif ( - not self._string_containts_ddl_queries(sql) + not self._string_contains_ddl_queries(sql) or sql_client.capabilities.supports_ddl_transactions ): # with sql_client.begin_transaction(): @@ -95,7 +95,7 @@ def exception(self) -> str: # this part of code should be never reached raise NotImplementedError() - def _string_containts_ddl_queries(self, sql: str) -> bool: + def _string_contains_ddl_queries(self, sql: str) -> bool: for cmd in DDL_COMMANDS: if re.search(cmd, sql, re.IGNORECASE): return True @@ -133,37 +133,28 @@ def state(self) -> TLoadJobState: class SqlJobClientBase(JobClientBase, WithStateSync): - _VERSION_TABLE_SCHEMA_COLUMNS: ClassVar[Tuple[str, ...]] = ( - "version_hash", - "schema_name", - "version", - "engine_version", - "inserted_at", - "schema", - ) - _STATE_TABLE_COLUMNS: ClassVar[Tuple[str, ...]] = ( - "version", - "engine_version", - "pipeline_name", - "state", - "created_at", - "_dlt_load_id", - ) - def __init__( self, schema: Schema, config: DestinationClientConfiguration, sql_client: SqlClientBase[TNativeConn], ) -> None: + # get definitions of the dlt tables, normalize column names and keep for later use + version_table_ = normalize_table_identifiers(version_table(), schema.naming) self.version_table_schema_columns = ", ".join( - sql_client.escape_column_name(col) for col in self._VERSION_TABLE_SCHEMA_COLUMNS + sql_client.escape_column_name(col) for col in version_table_["columns"] + ) + loads_table_ = normalize_table_identifiers(loads_table(), schema.naming) + self.loads_table_schema_columns = ", ".join( + sql_client.escape_column_name(col) for col in loads_table_["columns"] + ) + state_table_ = normalize_table_identifiers( + get_pipeline_state_query_columns(), schema.naming ) self.state_table_columns = ", ".join( - sql_client.escape_column_name(col) for col in self._STATE_TABLE_COLUMNS + sql_client.escape_column_name(col) for col in state_table_["columns"] ) - - super().__init__(schema, config) + super().__init__(schema, config, sql_client.capabilities) self.sql_client = sql_client assert isinstance(config, DestinationClientDwhConfiguration) self.config: DestinationClientDwhConfiguration = config @@ -250,10 +241,12 @@ def _create_replace_followup_jobs( def create_table_chain_completed_followup_jobs( self, table_chain: Sequence[TTableSchema], - table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, + completed_table_chain_jobs: Optional[Sequence[LoadJobInfo]] = None, ) -> List[NewLoadJob]: """Creates a list of followup jobs for merge write disposition and staging replace strategies""" - jobs = super().create_table_chain_completed_followup_jobs(table_chain, table_chain_jobs) + jobs = super().create_table_chain_completed_followup_jobs( + table_chain, completed_table_chain_jobs + ) write_disposition = table_chain[0]["write_disposition"] if write_disposition == "append": 
jobs.extend(self._create_append_followup_jobs(table_chain)) @@ -290,8 +283,7 @@ def complete_load(self, load_id: str) -> None: name = self.sql_client.make_qualified_table_name(self.schema.loads_table_name) now_ts = pendulum.now() self.sql_client.execute_sql( - f"INSERT INTO {name}(load_id, schema_name, status, inserted_at, schema_version_hash)" - " VALUES(%s, %s, %s, %s, %s);", + f"INSERT INTO {name}({self.loads_table_schema_columns}) VALUES(%s, %s, %s, %s, %s);", load_id, self.schema.name, 0, @@ -308,54 +300,84 @@ def __exit__( ) -> None: self.sql_client.close_connection() - def _get_storage_table_query_columns(self) -> List[str]: - """Column names used when querying table from information schema. - Override for databases that use different namings. - """ - fields = ["column_name", "data_type", "is_nullable"] - if self.capabilities.schema_supports_numeric_precision: - fields += ["numeric_precision", "numeric_scale"] - return fields + def get_storage_tables( + self, table_names: Iterable[str] + ) -> Iterable[Tuple[str, TTableSchemaColumns]]: + """Uses INFORMATION_SCHEMA to retrieve table and column information for tables in `table_names` iterator. + Table names should be normalized according to naming convention and will be further converted to desired casing + in order to (in most cases) create case-insensitive name suitable for search in information schema. - def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: - def _null_to_bool(v: str) -> bool: - if v == "NO": - return False - elif v == "YES": - return True - raise ValueError(v) + The column names are returned as in information schema. To match those with columns in existing table, you'll need to use + `schema.get_new_table_columns` method and pass the correct casing. Most of the casing function are irreversible so it is not + possible to convert identifiers into INFORMATION SCHEMA back into case sensitive dlt schema. + """ + table_names = list(table_names) + if len(table_names) == 0: + # empty generator + return + # get schema search components + catalog_name, schema_name, folded_table_names = ( + self.sql_client._get_information_schema_components(*table_names) + ) + # create table name conversion lookup table + name_lookup = { + folded_name: name for folded_name, name in zip(folded_table_names, table_names) + } + # this should never happen: we verify schema for name collisions before loading + assert len(name_lookup) == len(table_names), ( + f"One or more of tables in {table_names} after applying" + f" {self.capabilities.casefold_identifier} produced a name collision." 
+ ) - fields = self._get_storage_table_query_columns() - db_params = self.sql_client.make_qualified_table_name(table_name, escape=False).split( - ".", 3 + # rows = self.sql_client.execute_sql(query, *db_params) + query, db_params = self._get_info_schema_columns_query( + catalog_name, schema_name, folded_table_names ) - query = f""" -SELECT {",".join(fields)} - FROM INFORMATION_SCHEMA.COLUMNS -WHERE """ - if len(db_params) == 3: - query += "table_catalog = %s AND " - query += "table_schema = %s AND table_name = %s ORDER BY ordinal_position;" rows = self.sql_client.execute_sql(query, *db_params) - - # if no rows we assume that table does not exist - schema_table: TTableSchemaColumns = {} - if len(rows) == 0: - # TODO: additionally check if table exists - return False, schema_table - # TODO: pull more data to infer indexes, PK and uniques attributes/constraints + prev_table: str = None + storage_columns: TTableSchemaColumns = None for c in rows: + # make sure that new table is known + assert ( + c[0] in name_lookup + ), f"Table name {c[0]} not in expected tables {name_lookup.keys()}" + table_name = name_lookup[c[0]] + if prev_table != table_name: + # yield what we have + if storage_columns: + yield (prev_table, storage_columns) + # we have new table + storage_columns = {} + prev_table = table_name + # remove from table_names + table_names.remove(prev_table) + # add columns + col_name = c[1] numeric_precision = ( - c[3] if self.capabilities.schema_supports_numeric_precision else None + c[4] if self.capabilities.schema_supports_numeric_precision else None ) - numeric_scale = c[4] if self.capabilities.schema_supports_numeric_precision else None + numeric_scale = c[5] if self.capabilities.schema_supports_numeric_precision else None + schema_c: TColumnSchemaBase = { - "name": c[0], - "nullable": _null_to_bool(c[2]), - **self._from_db_type(c[1], numeric_precision, numeric_scale), + "name": col_name, + "nullable": info_schema_null_to_bool(c[3]), + **self._from_db_type(c[2], numeric_precision, numeric_scale), } - schema_table[c[0]] = schema_c # type: ignore - return True, schema_table + storage_columns[col_name] = schema_c # type: ignore + # yield last table, it must have at least one column or we had no rows + if storage_columns: + yield (prev_table, storage_columns) + # if no columns we assume that table does not exist + for table_name in table_names: + yield (table_name, {}) + + def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: + """Uses get_storage_tables to get single `table_name` schema. + + Returns (True, ...) 
if table exists and (False, {}) when not + """ + storage_table = list(self.get_storage_tables([table_name]))[0] + return len(storage_table[1]) > 0, storage_table[1] @abstractmethod def _from_db_type( @@ -365,31 +387,90 @@ def _from_db_type( def get_stored_schema(self) -> StorageSchemaInfo: name = self.sql_client.make_qualified_table_name(self.schema.version_table_name) + c_schema_name, c_inserted_at = self._norm_and_escape_columns("schema_name", "inserted_at") query = ( - f"SELECT {self.version_table_schema_columns} FROM {name} WHERE schema_name = %s ORDER" - " BY inserted_at DESC;" + f"SELECT {self.version_table_schema_columns} FROM {name} WHERE {c_schema_name} = %s" + f" ORDER BY {c_inserted_at} DESC;" ) return self._row_to_schema_info(query, self.schema.name) def get_stored_state(self, pipeline_name: str) -> StateInfo: state_table = self.sql_client.make_qualified_table_name(self.schema.state_table_name) loads_table = self.sql_client.make_qualified_table_name(self.schema.loads_table_name) + c_load_id, c_dlt_load_id, c_pipeline_name, c_status = self._norm_and_escape_columns( + "load_id", "_dlt_load_id", "pipeline_name", "status" + ) query = ( f"SELECT {self.state_table_columns} FROM {state_table} AS s JOIN {loads_table} AS l ON" - " l.load_id = s._dlt_load_id WHERE pipeline_name = %s AND l.status = 0 ORDER BY" - " l.load_id DESC" + f" l.{c_load_id} = s.{c_dlt_load_id} WHERE {c_pipeline_name} = %s AND l.{c_status} = 0" + f" ORDER BY {c_load_id} DESC" ) with self.sql_client.execute_query(query, pipeline_name) as cur: row = cur.fetchone() if not row: return None - return StateInfo(row[0], row[1], row[2], row[3], pendulum.instance(row[4])) + # NOTE: we request order of columns in SELECT statement which corresponds to StateInfo + return StateInfo( + version=row[0], + engine_version=row[1], + pipeline_name=row[2], + state=row[3], + created_at=pendulum.instance(row[4]), + _dlt_load_id=row[5], + ) + + def _norm_and_escape_columns(self, *columns: str) -> Iterator[str]: + return map( + self.sql_client.escape_column_name, map(self.schema.naming.normalize_path, columns) + ) def get_stored_schema_by_hash(self, version_hash: str) -> StorageSchemaInfo: - name = self.sql_client.make_qualified_table_name(self.schema.version_table_name) - query = f"SELECT {self.version_table_schema_columns} FROM {name} WHERE version_hash = %s;" + table_name = self.sql_client.make_qualified_table_name(self.schema.version_table_name) + (c_version_hash,) = self._norm_and_escape_columns("version_hash") + query = ( + f"SELECT {self.version_table_schema_columns} FROM {table_name} WHERE" + f" {c_version_hash} = %s;" + ) return self._row_to_schema_info(query, version_hash) + def _get_info_schema_columns_query( + self, catalog_name: Optional[str], schema_name: str, folded_table_names: List[str] + ) -> Tuple[str, List[Any]]: + """Generates SQL to query INFORMATION_SCHEMA.COLUMNS for a set of tables in `folded_table_names`. Input identifiers must be already + in a form that can be passed to a query via db_params. `catalogue_name` is optional and when None, the part of query selecting it + is skipped. 
+ + Returns: query and list of db_params tuple + """ + query = f""" +SELECT {",".join(self._get_storage_table_query_columns())} + FROM INFORMATION_SCHEMA.COLUMNS +WHERE """ + + db_params = [] + if catalog_name: + db_params.append(catalog_name) + query += "table_catalog = %s AND " + db_params.append(schema_name) + db_params = db_params + folded_table_names + # placeholder for each table + table_placeholders = ",".join(["%s"] * len(folded_table_names)) + query += ( + f"table_schema = %s AND table_name IN ({table_placeholders}) ORDER BY table_name," + " ordinal_position;" + ) + + return query, db_params + + def _get_storage_table_query_columns(self) -> List[str]: + """Column names used when querying table from information schema. + Override for databases that use different namings. + """ + fields = ["table_name", "column_name", "data_type", "is_nullable"] + if self.capabilities.schema_supports_numeric_precision: + fields += ["numeric_precision", "numeric_scale"] + return fields + def _execute_schema_update_sql(self, only_tables: Iterable[str]) -> TSchemaTables: sql_scripts, schema_update = self._build_schema_update_sql(only_tables) # Stay within max query size when doing DDL. @@ -416,12 +497,16 @@ def _build_schema_update_sql( """ sql_updates = [] schema_update: TSchemaTables = {} - for table_name in only_tables or self.schema.tables: - exists, storage_table = self.get_storage_table(table_name) - new_columns = self._create_table_update(table_name, storage_table) + for table_name, storage_columns in self.get_storage_tables( + only_tables or self.schema.tables.keys() + ): + # this will skip incomplete columns + new_columns = self._create_table_update(table_name, storage_columns) if len(new_columns) > 0: # build and add sql to execute - sql_statements = self._get_table_update_sql(table_name, new_columns, exists) + sql_statements = self._get_table_update_sql( + table_name, new_columns, len(storage_columns) > 0 + ) for sql in sql_statements: if not sql.endswith(";"): sql += ";" @@ -472,7 +557,7 @@ def _get_table_update_sql( for hint in COLUMN_HINTS: if any(c.get(hint, False) is True for c in new_columns): hint_columns = [ - self.capabilities.escape_identifier(c["name"]) + self.sql_client.escape_column_name(c["name"]) for c in new_columns if c.get(hint, False) ] @@ -501,8 +586,13 @@ def _gen_not_null(v: bool) -> str: def _create_table_update( self, table_name: str, storage_columns: TTableSchemaColumns ) -> Sequence[TColumnSchema]: - # compare table with stored schema and produce delta - updates = self.schema.get_new_table_columns(table_name, storage_columns) + """Compares storage columns with schema table and produce delta columns difference""" + updates = self.schema.get_new_table_columns( + table_name, + storage_columns, + case_sensitive=self.capabilities.has_case_sensitive_identifiers + and self.capabilities.casefold_identifier is str, + ) logger.info(f"Found {len(updates)} updates for {table_name} in {self.schema.name}") return updates @@ -526,16 +616,17 @@ def _row_to_schema_info(self, query: str, *args: Any) -> StorageSchemaInfo: pass # make utc datetime - inserted_at = pendulum.instance(row[4]) + inserted_at = pendulum.instance(row[2]) - return StorageSchemaInfo(row[0], row[1], row[2], row[3], inserted_at, schema_str) + return StorageSchemaInfo(row[4], row[3], row[0], row[1], inserted_at, schema_str) def _delete_schema_in_storage(self, schema: Schema) -> None: """ Delete all stored versions with the same name as given schema """ name = 
self.sql_client.make_qualified_table_name(self.schema.version_table_name) - self.sql_client.execute_sql(f"DELETE FROM {name} WHERE schema_name = %s;", schema.name) + (c_schema_name,) = self._norm_and_escape_columns("schema_name") + self.sql_client.execute_sql(f"DELETE FROM {name} WHERE {c_schema_name} = %s;", schema.name) def _update_schema_in_storage(self, schema: Schema) -> None: # get schema string or zip @@ -554,14 +645,21 @@ def _commit_schema_update(self, schema: Schema, schema_str: str) -> None: self.sql_client.execute_sql( f"INSERT INTO {name}({self.version_table_schema_columns}) VALUES (%s, %s, %s, %s, %s," " %s);", - schema.stored_version_hash, - schema.name, schema.version, schema.ENGINE_VERSION, now_ts, + schema.name, + schema.stored_version_hash, schema_str, ) + def _verify_schema(self) -> None: + super()._verify_schema() + if exceptions := verify_sql_job_client_schema(self.schema, warnings=True): + for exception in exceptions: + logger.error(str(exception)) + raise exceptions[0] + class SqlJobClientWithStaging(SqlJobClientBase, WithStagingDataset): in_staging_mode: bool = False diff --git a/dlt/destinations/sql_client.py b/dlt/destinations/sql_client.py index 9b73d7d28c..7912ac4561 100644 --- a/dlt/destinations/sql_client.py +++ b/dlt/destinations/sql_client.py @@ -30,13 +30,15 @@ class SqlClientBase(ABC, Generic[TNativeConn]): dbapi: ClassVar[DBApi] = None - capabilities: ClassVar[DestinationCapabilitiesContext] = None - def __init__(self, database_name: str, dataset_name: str) -> None: + def __init__( + self, database_name: str, dataset_name: str, capabilities: DestinationCapabilitiesContext + ) -> None: if not dataset_name: raise ValueError(dataset_name) self.dataset_name = dataset_name self.database_name = database_name + self.capabilities = capabilities @abstractmethod def open_connection(self) -> TNativeConn: @@ -75,9 +77,12 @@ def has_dataset(self) -> bool: SELECT 1 FROM INFORMATION_SCHEMA.SCHEMATA WHERE """ - db_params = self.fully_qualified_dataset_name(escape=False).split(".", 2) - if len(db_params) == 2: + catalog_name, schema_name, _ = self._get_information_schema_components() + db_params: List[str] = [] + if catalog_name is not None: query += " catalog_name = %s AND " + db_params.append(catalog_name) + db_params.append(schema_name) query += "schema_name = %s" rows = self.execute_sql(query, *db_params) return len(rows) > 0 @@ -137,16 +142,39 @@ def execute_many( ret.append(result) return ret - @abstractmethod + def catalog_name(self, escape: bool = True) -> Optional[str]: + # default is no catalogue component of the name, which typically means that + # connection is scoped to a current database + return None + def fully_qualified_dataset_name(self, escape: bool = True) -> str: - pass + return ".".join(self.make_qualified_table_name_path(None, escape=escape)) def make_qualified_table_name(self, table_name: str, escape: bool = True) -> str: + return ".".join(self.make_qualified_table_name_path(table_name, escape=escape)) + + def make_qualified_table_name_path( + self, table_name: Optional[str], escape: bool = True + ) -> List[str]: + """Returns a list with path components leading from catalog to table_name. + Used to construct fully qualified names. `table_name` is optional. 
+ """ + path: List[str] = [] + if catalog_name := self.catalog_name(escape=escape): + path.append(catalog_name) + dataset_name = self.capabilities.casefold_identifier(self.dataset_name) if escape: - table_name = self.capabilities.escape_identifier(table_name) - return f"{self.fully_qualified_dataset_name(escape=escape)}.{table_name}" + dataset_name = self.capabilities.escape_identifier(dataset_name) + path.append(dataset_name) + if table_name: + table_name = self.capabilities.casefold_identifier(table_name) + if escape: + table_name = self.capabilities.escape_identifier(table_name) + path.append(table_name) + return path def escape_column_name(self, column_name: str, escape: bool = True) -> str: + column_name = self.capabilities.casefold_identifier(column_name) if escape: return self.capabilities.escape_identifier(column_name) return column_name @@ -191,6 +219,18 @@ def is_dbapi_exception(ex: Exception) -> bool: def make_staging_dataset_name(dataset_name: str) -> str: return dataset_name + "_staging" + def _get_information_schema_components(self, *tables: str) -> Tuple[str, str, List[str]]: + """Gets catalog name, schema name and name of the tables in format that can be directly + used to query INFORMATION_SCHEMA. catalog name is optional: in that case None is + returned in the first element of the tuple. + """ + schema_path = self.make_qualified_table_name_path(None, escape=False) + return ( + self.catalog_name(escape=False), + schema_path[-1], + [self.make_qualified_table_name_path(table, escape=False)[-1] for table in tables], + ) + # # generate sql statements # @@ -220,6 +260,11 @@ def _get_columns(self) -> List[str]: return [c[0] for c in self.native_cursor.description] def df(self, chunk_size: int = None, **kwargs: Any) -> Optional[DataFrame]: + """Fetches results as data frame in full or in specified chunks. + + May use native pandas/arrow reader if available. Depending on + the native implementation chunk size may vary. 
+ """ from dlt.common.libs.pandas_sql import _wrap_result columns = self._get_columns() diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index 4f8e29ae0d..b9539fe114 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -117,7 +117,7 @@ def _generate_insert_sql( table_name = sql_client.make_qualified_table_name(table["name"]) columns = ", ".join( map( - sql_client.capabilities.escape_identifier, + sql_client.escape_column_name, get_columns_names_with_prop(table, "name"), ) ) @@ -361,10 +361,8 @@ def gen_merge_sql( sql: List[str] = [] root_table = table_chain[0] - escape_id = sql_client.capabilities.escape_identifier + escape_column_id = sql_client.escape_column_name escape_lit = sql_client.capabilities.escape_literal - if escape_id is None: - escape_id = DestinationCapabilitiesContext.generic_capabilities().escape_identifier if escape_lit is None: escape_lit = DestinationCapabilitiesContext.generic_capabilities().escape_literal @@ -376,13 +374,13 @@ def gen_merge_sql( # get merge and primary keys from top level primary_keys = list( map( - escape_id, + escape_column_id, get_columns_names_with_prop(root_table, "primary_key"), ) ) merge_keys = list( map( - escape_id, + escape_column_id, get_columns_names_with_prop(root_table, "merge_key"), ) ) @@ -419,7 +417,7 @@ def gen_merge_sql( f" {root_table['name']} so it is not possible to link child tables to it.", ) # get first unique column - unique_column = escape_id(unique_columns[0]) + unique_column = escape_column_id(unique_columns[0]) # create temp table with unique identifier create_delete_temp_table_sql, delete_temp_table_name = ( cls.gen_delete_temp_table_sql( @@ -442,14 +440,14 @@ def gen_merge_sql( f" {table['name']} so it is not possible to refer to top level table" f" {root_table['name']} unique column {unique_column}", ) - root_key_column = escape_id(root_key_columns[0]) + root_key_column = escape_column_id(root_key_columns[0]) sql.append( cls.gen_delete_from_sql( table_name, root_key_column, delete_temp_table_name, unique_column ) ) - # delete from top table now that child tables have been prcessed + # delete from top table now that child tables have been processed sql.append( cls.gen_delete_from_sql( root_table_name, unique_column, delete_temp_table_name, unique_column @@ -461,10 +459,10 @@ def gen_merge_sql( hard_delete_col = get_first_column_name_with_prop(root_table, "hard_delete") if hard_delete_col is not None: # any value indicates a delete for non-boolean columns - not_deleted_cond = f"{escape_id(hard_delete_col)} IS NULL" + not_deleted_cond = f"{escape_column_id(hard_delete_col)} IS NULL" if root_table["columns"][hard_delete_col]["data_type"] == "bool": # only True values indicate a delete for boolean columns - not_deleted_cond += f" OR {escape_id(hard_delete_col)} = {escape_lit(False)}" + not_deleted_cond += f" OR {escape_column_id(hard_delete_col)} = {escape_lit(False)}" # get dedup sort information dedup_sort = get_dedup_sort_tuple(root_table) @@ -503,7 +501,7 @@ def gen_merge_sql( uniq_column = unique_column if table.get("parent") is None else root_key_column insert_cond = f"{uniq_column} IN (SELECT * FROM {insert_temp_table_name})" - columns = list(map(escape_id, get_columns_names_with_prop(table, "name"))) + columns = list(map(escape_column_id, get_columns_names_with_prop(table, "name"))) col_str = ", ".join(columns) select_sql = f"SELECT {col_str} FROM {staging_table_name} WHERE {insert_cond}" if len(primary_keys) > 0 and len(table_chain) == 1: @@ -534,9 +532,11 @@ def 
gen_scd2_sql( # get column names caps = sql_client.capabilities - escape_id = caps.escape_identifier - from_, to = list(map(escape_id, get_validity_column_names(root_table))) # validity columns - hash_ = escape_id( + escape_column_id = sql_client.escape_column_name + from_, to = list( + map(escape_column_id, get_validity_column_names(root_table)) + ) # validity columns + hash_ = escape_column_id( get_first_column_name_with_prop(root_table, "x-row-version") ) # row hash column @@ -568,7 +568,7 @@ def gen_scd2_sql( """) # insert new active records in root table - columns = map(escape_id, list(root_table["columns"].keys())) + columns = map(escape_column_id, list(root_table["columns"].keys())) col_str = ", ".join([c for c in columns if c not in (from_, to)]) sql.append(f""" INSERT INTO {root_table_name} ({col_str}, {from_}, {to}) @@ -592,7 +592,7 @@ def gen_scd2_sql( " it is not possible to link child tables to it.", ) # get first unique column - unique_column = escape_id(unique_columns[0]) + unique_column = escape_column_id(unique_columns[0]) # TODO: - based on deterministic child hashes (OK) # - if row hash changes all is right # - if it does not we only capture new records, while we should replace existing with those in stage diff --git a/dlt/destinations/utils.py b/dlt/destinations/utils.py index c02460fe58..d24ad7c5a7 100644 --- a/dlt/destinations/utils.py +++ b/dlt/destinations/utils.py @@ -1,9 +1,23 @@ import re +from typing import Any, List, Optional, Tuple + +from dlt.common import logger +from dlt.common.schema import Schema +from dlt.common.schema.exceptions import SchemaCorruptedException +from dlt.common.schema.typing import MERGE_STRATEGIES, TTableSchema +from dlt.common.schema.utils import ( + get_columns_names_with_prop, + get_first_column_name_with_prop, + has_column_with_prop, + pipeline_state_table, +) from typing import Any, cast, Tuple, Dict, Type from dlt.destinations.exceptions import DatabaseTransientException from dlt.extract import DltResource, resource as make_resource +RE_DATA_TYPE = re.compile(r"([A-Z]+)\((\d+)(?:,\s?(\d+))?\)") + def ensure_resource(data: Any) -> DltResource: """Wraps `data` in a DltResource if it's not a DltResource already.""" @@ -13,6 +27,119 @@ def ensure_resource(data: Any) -> DltResource: return cast(DltResource, make_resource(data, name=resource_name)) +def info_schema_null_to_bool(v: str) -> bool: + """Converts INFORMATION SCHEMA truth values to Python bool""" + if v in ("NO", "0"): + return False + elif v in ("YES", "1"): + return True + raise ValueError(v) + + +def parse_db_data_type_str_with_precision(db_type: str) -> Tuple[str, Optional[int], Optional[int]]: + """Parses a db data type with optional precision or precision and scale information""" + # Search for matches using the regular expression + match = RE_DATA_TYPE.match(db_type) + + # If the pattern matches, extract the type, precision, and scale + if match: + db_type = match.group(1) + precision = int(match.group(2)) + scale = int(match.group(3)) if match.group(3) else None + return db_type, precision, scale + + # If the pattern does not match, return the original type without precision and scale + return db_type, None, None + + +def get_pipeline_state_query_columns() -> TTableSchema: + """We get definition of pipeline state table without columns we do not need for the query""" + state_table = pipeline_state_table() + # we do not need version_hash to be backward compatible as long as we can + state_table["columns"].pop("version_hash") + return state_table + + +def 
verify_sql_job_client_schema(schema: Schema, warnings: bool = True) -> List[Exception]: + log = logger.warning if warnings else logger.info + # collect all exceptions to show all problems in the schema + exception_log: List[Exception] = [] + + # verifies schema settings specific to sql job client + for table in schema.data_tables(): + table_name = table["name"] + if table.get("write_disposition") == "merge": + if "x-merge-strategy" in table and table["x-merge-strategy"] not in MERGE_STRATEGIES: # type: ignore[typeddict-item] + exception_log.append( + SchemaCorruptedException( + schema.name, + f'"{table["x-merge-strategy"]}" is not a valid merge strategy. ' # type: ignore[typeddict-item] + f"""Allowed values: {', '.join(['"' + s + '"' for s in MERGE_STRATEGIES])}.""", + ) + ) + if ( + table.get("x-merge-strategy") == "delete-insert" + and not has_column_with_prop(table, "primary_key") + and not has_column_with_prop(table, "merge_key") + ): + log( + f"Table {table_name} has `write_disposition` set to `merge`" + " and `merge_strategy` set to `delete-insert`, but no primary or" + " merge keys defined." + " dlt will fall back to `append` for this table." + ) + if has_column_with_prop(table, "hard_delete"): + if len(get_columns_names_with_prop(table, "hard_delete")) > 1: + exception_log.append( + SchemaCorruptedException( + schema.name, + f'Found multiple "hard_delete" column hints for table "{table_name}" in' + f' schema "{schema.name}" while only one is allowed:' + f' {", ".join(get_columns_names_with_prop(table, "hard_delete"))}.', + ) + ) + if table.get("write_disposition") in ("replace", "append"): + log( + f"""The "hard_delete" column hint for column "{get_first_column_name_with_prop(table, 'hard_delete')}" """ + f'in table "{table_name}" with write disposition' + f' "{table.get("write_disposition")}"' + f' in schema "{schema.name}" will be ignored.' + ' The "hard_delete" column hint is only applied when using' + ' the "merge" write disposition.' + ) + if has_column_with_prop(table, "dedup_sort"): + if len(get_columns_names_with_prop(table, "dedup_sort")) > 1: + exception_log.append( + SchemaCorruptedException( + schema.name, + f'Found multiple "dedup_sort" column hints for table "{table_name}" in' + f' schema "{schema.name}" while only one is allowed:' + f' {", ".join(get_columns_names_with_prop(table, "dedup_sort"))}.', + ) + ) + if table.get("write_disposition") in ("replace", "append"): + log( + f"""The "dedup_sort" column hint for column "{get_first_column_name_with_prop(table, 'dedup_sort')}" """ + f'in table "{table_name}" with write disposition' + f' "{table.get("write_disposition")}"' + f' in schema "{schema.name}" will be ignored.' + ' The "dedup_sort" column hint is only applied when using' + ' the "merge" write disposition.' + ) + if table.get("write_disposition") == "merge" and not has_column_with_prop( + table, "primary_key" + ): + log( + f"""The "dedup_sort" column hint for column "{get_first_column_name_with_prop(table, 'dedup_sort')}" """ + f'in table "{table_name}" with write disposition' + f' "{table.get("write_disposition")}"' + f' in schema "{schema.name}" will be ignored.' + ' The "dedup_sort" column hint is only applied when a' + " primary key has been specified." 
+ ) + return exception_log + + def _convert_to_old_pyformat( new_style_string: str, args: Tuple[Any, ...], operational_error_cls: Type[Exception] ) -> Tuple[str, Dict[str, Any]]: diff --git a/dlt/extract/__init__.py b/dlt/extract/__init__.py index 03b2e59539..4029241634 100644 --- a/dlt/extract/__init__.py +++ b/dlt/extract/__init__.py @@ -4,13 +4,14 @@ from dlt.extract.decorators import source, resource, transformer, defer from dlt.extract.incremental import Incremental from dlt.extract.wrappers import wrap_additional_type -from dlt.extract.extractors import materialize_schema_item +from dlt.extract.extractors import materialize_schema_item, with_file_import __all__ = [ "DltResource", "DltSource", "with_table_name", "with_hints", + "with_file_import", "make_hints", "source", "resource", diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 2bb4a3ce87..ad10ef3ad3 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -35,22 +35,20 @@ from dlt.common.schema.schema import Schema from dlt.common.schema.typing import ( TColumnNames, + TFileFormat, TWriteDisposition, TWriteDispositionConfig, TAnySchemaColumns, TSchemaContract, TTableFormat, ) -from dlt.extract.hints import make_hints -from dlt.extract.utils import ( - simulate_func_call, - wrap_compat_transformer, - wrap_resource_gen, -) from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages.schema_storage import SchemaStorage from dlt.common.typing import AnyFun, ParamSpec, Concatenate, TDataItem, TDataItems from dlt.common.utils import get_callable_name, get_module_name, is_inner_callable + +from dlt.extract.hints import make_hints +from dlt.extract.utils import simulate_func_call from dlt.extract.exceptions import ( CurrentSourceNotAvailable, DynamicNameNotStandaloneResource, @@ -64,8 +62,6 @@ SourceNotAFunction, CurrentSourceSchemaNotAvailable, ) -from dlt.extract.incremental import IncrementalResourceWrapper - from dlt.extract.items import TTableHintTemplate from dlt.extract.source import DltSource from dlt.extract.resource import DltResource, TUnboundDltResource, TDltResourceImpl @@ -210,16 +206,16 @@ def decorator( source_sections = (known_sections.SOURCES, source_section, effective_name) conf_f = with_config(f, spec=spec, sections=source_sections) - def _eval_rv(_rv: Any) -> TDltSourceImpl: + def _eval_rv(_rv: Any, schema_copy: Schema) -> TDltSourceImpl: """Evaluates return value from the source function or coroutine""" if _rv is None: - raise SourceDataIsNone(schema.name) + raise SourceDataIsNone(schema_copy.name) # if generator, consume it immediately if inspect.isgenerator(_rv): _rv = list(_rv) # convert to source - s = _impl_cls.from_data(schema.clone(update_normalizers=True), source_section, _rv) + s = _impl_cls.from_data(schema_copy, source_section, _rv) # apply hints if max_table_nesting is not None: s.max_table_nesting = max_table_nesting @@ -231,7 +227,10 @@ def _eval_rv(_rv: Any) -> TDltSourceImpl: @wraps(conf_f) def _wrap(*args: Any, **kwargs: Any) -> TDltSourceImpl: """Wrap a regular function, injection context must be a part of the wrap""" - with Container().injectable_context(SourceSchemaInjectableContext(schema)): + # clone the schema passed to decorator, update normalizers, remove processing hints + # NOTE: source may be called several times in many different settings + schema_copy = schema.clone(update_normalizers=True, remove_processing_hints=True) + with Container().injectable_context(SourceSchemaInjectableContext(schema_copy)): # configurations 
will be accessed in this section in the source proxy = Container()[PipelineContext] pipeline_name = None if not proxy.is_active() else proxy.pipeline().pipeline_name @@ -239,18 +238,21 @@ def _wrap(*args: Any, **kwargs: Any) -> TDltSourceImpl: ConfigSectionContext( pipeline_name=pipeline_name, sections=source_sections, - source_state_key=schema.name, + source_state_key=schema_copy.name, ) ): rv = conf_f(*args, **kwargs) - return _eval_rv(rv) + return _eval_rv(rv, schema_copy) @wraps(conf_f) async def _wrap_coro(*args: Any, **kwargs: Any) -> TDltSourceImpl: """In case of co-routine we must wrap the whole injection context in awaitable, there's no easy way to avoid some code duplication """ - with Container().injectable_context(SourceSchemaInjectableContext(schema)): + # clone the schema passed to decorator, update normalizers, remove processing hints + # NOTE: source may be called several times in many different settings + schema_copy = schema.clone(update_normalizers=True, remove_processing_hints=True) + with Container().injectable_context(SourceSchemaInjectableContext(schema_copy)): # configurations will be accessed in this section in the source proxy = Container()[PipelineContext] pipeline_name = None if not proxy.is_active() else proxy.pipeline().pipeline_name @@ -258,11 +260,11 @@ async def _wrap_coro(*args: Any, **kwargs: Any) -> TDltSourceImpl: ConfigSectionContext( pipeline_name=pipeline_name, sections=source_sections, - source_state_key=schema.name, + source_state_key=schema_copy.name, ) ): rv = await conf_f(*args, **kwargs) - return _eval_rv(rv) + return _eval_rv(rv, schema_copy) # get spec for wrapped function SPEC = get_fun_spec(conf_f) @@ -296,6 +298,7 @@ def resource( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, + file_format: TTableHintTemplate[TFileFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, @@ -316,6 +319,7 @@ def resource( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, + file_format: TTableHintTemplate[TFileFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, @@ -336,6 +340,7 @@ def resource( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, + file_format: TTableHintTemplate[TFileFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, @@ -359,6 +364,7 @@ def resource( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, + file_format: TTableHintTemplate[TFileFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, @@ -378,6 +384,7 @@ def resource( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, + file_format: TTableHintTemplate[TFileFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, @@ -413,9 +420,10 @@ def resource( If not present, the name of the decorated function will be used. 
table_name (TTableHintTemplate[str], optional): An table name, if different from `name`. - max_table_nesting (int, optional): A schema hint that sets the maximum depth of nested table above which the remaining nodes are loaded as structs or JSON. This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. + max_table_nesting (int, optional): A schema hint that sets the maximum depth of nested table above which the remaining nodes are loaded as structs or JSON. + write_disposition (TTableHintTemplate[TWriteDispositionConfig], optional): Controls how to write data to a table. Accepts a shorthand string literal or configuration dictionary. Allowed shorthand string literals: `append` will always add new data at the end of the table. `replace` will replace existing data with new data. `skip` will prevent data from loading. "merge" will deduplicate and merge data based on "primary_key" and "merge_key" hints. Defaults to "append". Write behaviour can be further customized through a configuration dictionary. For example, to obtain an SCD2 table provide `write_disposition={"disposition": "merge", "strategy": "scd2"}`. @@ -433,7 +441,12 @@ def resource( This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. schema_contract (TSchemaContract, optional): Schema contract settings that will be applied to all resources of this source (if not overridden in the resource itself) - table_format (Literal["iceberg"], optional): Defines the storage format of the table. Currently only "iceberg" is supported on Athena, other destinations ignore this hint. + + table_format (Literal["iceberg", "delta"], optional): Defines the storage format of the table. Currently only "iceberg" is supported on Athena, and "delta" on the filesystem. + Other destinations ignore this hint. + + file_format (Literal["preferred", ...], optional): Format of the file in which resource data is stored. Useful when importing external files. Use `preferred` to force + a file format that is preferred by the destination used. This setting superseded the `load_file_format` passed to pipeline `run` method. selected (bool, optional): When `True` `dlt pipeline` will extract and load this resource, if `False`, the resource will be ignored. 
@@ -464,6 +477,7 @@ def make_resource(_name: str, _section: str, _data: Any) -> TDltResourceImpl: merge_key=merge_key, schema_contract=schema_contract, table_format=table_format, + file_format=file_format, ) resource = _impl_cls.from_data( @@ -574,10 +588,14 @@ def transformer( data_from: TUnboundDltResource = DltResource.Empty, name: str = None, table_name: TTableHintTemplate[str] = None, + max_table_nesting: int = None, write_disposition: TTableHintTemplate[TWriteDisposition] = None, columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, + table_format: TTableHintTemplate[TTableFormat] = None, + file_format: TTableHintTemplate[TFileFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, @@ -591,10 +609,14 @@ def transformer( data_from: TUnboundDltResource = DltResource.Empty, name: TTableHintTemplate[str] = None, table_name: TTableHintTemplate[str] = None, + max_table_nesting: int = None, write_disposition: TTableHintTemplate[TWriteDisposition] = None, columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, + table_format: TTableHintTemplate[TTableFormat] = None, + file_format: TTableHintTemplate[TFileFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, @@ -612,10 +634,14 @@ def transformer( data_from: TUnboundDltResource = DltResource.Empty, name: str = None, table_name: TTableHintTemplate[str] = None, + max_table_nesting: int = None, write_disposition: TTableHintTemplate[TWriteDisposition] = None, columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, + table_format: TTableHintTemplate[TTableFormat] = None, + file_format: TTableHintTemplate[TFileFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, @@ -629,10 +655,14 @@ def transformer( data_from: TUnboundDltResource = DltResource.Empty, name: TTableHintTemplate[str] = None, table_name: TTableHintTemplate[str] = None, + max_table_nesting: int = None, write_disposition: TTableHintTemplate[TWriteDisposition] = None, columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, + table_format: TTableHintTemplate[TTableFormat] = None, + file_format: TTableHintTemplate[TFileFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, @@ -646,10 +676,14 @@ def transformer( data_from: TUnboundDltResource = DltResource.Empty, name: TTableHintTemplate[str] = None, table_name: TTableHintTemplate[str] = None, + max_table_nesting: int = None, write_disposition: TTableHintTemplate[TWriteDisposition] = None, columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, + table_format: TTableHintTemplate[TTableFormat] = None, + file_format: 
TTableHintTemplate[TFileFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, @@ -692,6 +726,8 @@ def transformer( table_name (TTableHintTemplate[str], optional): An table name, if different from `name`. This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. + max_table_nesting (int, optional): A schema hint that sets the maximum depth of nested table above which the remaining nodes are loaded as structs or JSON. + write_disposition (Literal["skip", "append", "replace", "merge"], optional): Controls how to write data to a table. `append` will always add new data at the end of the table. `replace` will replace existing data with new data. `skip` will prevent data from loading. "merge" will deduplicate and merge data based on "primary_key" and "merge_key" hints. Defaults to "append". This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. @@ -704,6 +740,14 @@ def transformer( merge_key (str | Sequence[str]): A column name or a list of column names that define a merge key. Typically used with "merge" write disposition to remove overlapping data ranges ie. to keep a single record for a given day. This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. + schema_contract (TSchemaContract, optional): Schema contract settings that will be applied to all resources of this source (if not overridden in the resource itself) + + table_format (Literal["iceberg", "delta"], optional): Defines the storage format of the table. Currently only "iceberg" is supported on Athena, and "delta" on the filesystem. + Other destinations ignore this hint. + + file_format (Literal["preferred", ...], optional): Format of the file in which resource data is stored. Useful when importing external files. Use `preferred` to force + a file format that is preferred by the destination used. This setting superseded the `load_file_format` passed to pipeline `run` method. + selected (bool, optional): When `True` `dlt pipeline` will extract and load this resource, if `False`, the resource will be ignored. spec (Type[BaseConfiguration], optional): A specification of configuration and secret values required by the source. @@ -722,10 +766,14 @@ def transformer( f, name=name, table_name=table_name, + max_table_nesting=max_table_nesting, write_disposition=write_disposition, columns=columns, primary_key=primary_key, merge_key=merge_key, + schema_contract=schema_contract, + table_format=table_format, + file_format=file_format, selected=selected, spec=spec, standalone=standalone, @@ -741,8 +789,11 @@ def _maybe_load_schema_for_callable(f: AnyFun, name: str) -> Optional[Schema]: try: file = inspect.getsourcefile(f) if file: - return SchemaStorage.load_schema_file(os.path.dirname(file), name) - + schema = SchemaStorage.load_schema_file( + os.path.dirname(file), name, remove_processing_hints=True + ) + schema.update_normalizers() + return schema except SchemaNotFoundError: pass return None diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index f8966c3ced..5769be1a8d 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -170,6 +170,9 @@ def add_item(item: Any) -> bool: class Extract(WithStepInfo[ExtractMetrics, ExtractInfo]): + original_data: Any + """Original data from which the extracted DltSource was created. 
Will be used to describe in extract info""" + def __init__( self, schema_storage: SchemaStorage, @@ -181,6 +184,7 @@ def __init__( self.collector = collector self.schema_storage = schema_storage self.extract_storage = ExtractStorage(normalize_storage_config) + # TODO: this should be passed together with DltSource to extract() self.original_data: Any = original_data super().__init__() @@ -370,7 +374,9 @@ def extract( load_package_state_update: Optional[Dict[str, Any]] = None, ) -> str: # generate load package to be able to commit all the sources together later - load_id = self.extract_storage.create_load_package(source.discover_schema()) + load_id = self.extract_storage.create_load_package( + source.discover_schema(), reuse_exiting_package=True + ) with Container().injectable_context( SourceSchemaInjectableContext(source.schema) ), Container().injectable_context( @@ -405,14 +411,10 @@ def extract( commit_load_package_state() return load_id - def commit_packages(self, pipline_state_doc: TPipelineStateDoc = None) -> None: - """Commits all extracted packages to normalize storage, and adds the pipeline state to the load package""" + def commit_packages(self) -> None: + """Commits all extracted packages to normalize storage""" # commit load packages for load_id, metrics in self._load_id_metrics.items(): - if pipline_state_doc: - package_state = self.extract_storage.new_packages.get_load_package_state(load_id) - package_state["pipeline_state"] = {**pipline_state_doc, "dlt_load_id": load_id} - self.extract_storage.new_packages.save_load_package_state(load_id, package_state) self.extract_storage.commit_new_load_package( load_id, self.schema_storage[metrics[0]["schema_name"]] ) diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index 48f0d6968e..4a1de2517d 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -1,14 +1,14 @@ from copy import copy -from typing import Set, Dict, Any, Optional, List +from typing import Set, Dict, Any, Optional, List, Union from dlt.common.configuration import known_sections, resolve_configuration, with_config from dlt.common import logger from dlt.common.configuration.specs import BaseConfiguration, configspec +from dlt.common.data_writers import DataWriterMetrics from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.exceptions import MissingDependencyException - from dlt.common.runtime.collector import Collector, NULL_COLLECTOR -from dlt.common.typing import TDataItems, TDataItem +from dlt.common.typing import TDataItems, TDataItem, TLoaderFileFormat from dlt.common.schema import Schema, utils from dlt.common.schema.typing import ( TSchemaContractDict, @@ -17,9 +17,9 @@ TTableSchemaColumns, TPartialTableSchema, ) -from dlt.extract.hints import HintsMeta +from dlt.extract.hints import HintsMeta, TResourceHints from dlt.extract.resource import DltResource -from dlt.extract.items import TableNameMeta +from dlt.extract.items import DataItemWithMeta, TableNameMeta from dlt.extract.storage import ExtractorItemStorage from dlt.normalize.configuration import ItemsNormalizerConfiguration @@ -47,6 +47,50 @@ def materialize_schema_item() -> MaterializedEmptyList: return MaterializedEmptyList() +class ImportFileMeta(HintsMeta): + __slots__ = ("file_path", "metrics", "file_format") + + def __init__( + self, + file_path: str, + metrics: DataWriterMetrics, + file_format: TLoaderFileFormat = None, + hints: TResourceHints = None, + create_table_variant: bool = None, + ) -> None: + super().__init__(hints, 
create_table_variant) + self.file_path = file_path + self.metrics = metrics + self.file_format = file_format + + +def with_file_import( + file_path: str, + file_format: TLoaderFileFormat, + items_count: int = 0, + hints: Union[TResourceHints, TDataItem] = None, +) -> DataItemWithMeta: + """Marks file under `file_path` to be associated with current resource and imported into the load package as a file of + type `file_format`. + + You can provide optional `hints` that will be applied to the current resource. Note that you should avoid schema inference at + runtime if possible and if that is not possible - to do that only once per extract process. Use `make_hints` in `mark` module + to create hints. You can also pass Arrow table or Pandas data frame form which schema will be taken (but content discarded). + Create `TResourceHints` with `make_hints`. + + If number of records in `file_path` is known, pass it in `items_count` so `dlt` can generate correct extract metrics. + + Note that `dlt` does not sniff schemas from data and will not guess right file format for you. + """ + metrics = DataWriterMetrics(file_path, items_count, 0, 0, 0) + item: TDataItem = None + # if hints are dict assume that this is dlt schema, if not - that it is arrow table + if not isinstance(hints, dict): + item = hints + hints = None + return DataItemWithMeta(ImportFileMeta(file_path, metrics, file_format, hints, False), item) + + class Extractor: @configspec class ExtractorConfiguration(BaseConfiguration): @@ -78,7 +122,7 @@ def __init__( def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: """Write `items` to `resource` optionally computing table schemas and revalidating/filtering data""" - if isinstance(meta, HintsMeta): + if isinstance(meta, HintsMeta) and meta.hints: # update the resource with new hints, remove all caches so schema is recomputed # and contracts re-applied resource.merge_hints(meta.hints, meta.create_table_variant) @@ -93,7 +137,7 @@ def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> No self._write_to_static_table(resource, table_name, items, meta) else: # table has name or other hints depending on data items - self._write_to_dynamic_table(resource, items) + self._write_to_dynamic_table(resource, items, meta) def write_empty_items_file(self, table_name: str) -> None: table_name = self.naming.normalize_table_identifier(table_name) @@ -129,7 +173,24 @@ def _write_item( if isinstance(items, MaterializedEmptyList): self.resources_with_empty.add(resource_name) - def _write_to_dynamic_table(self, resource: DltResource, items: TDataItems) -> None: + def _import_item( + self, + table_name: str, + resource_name: str, + meta: ImportFileMeta, + ) -> None: + metrics = self.item_storage.import_items_file( + self.load_id, + self.schema.name, + table_name, + meta.file_path, + meta.metrics, + meta.file_format, + ) + self.collector.update(table_name, inc=metrics.items_count) + self.resources_with_items.add(resource_name) + + def _write_to_dynamic_table(self, resource: DltResource, items: TDataItems, meta: Any) -> None: if not isinstance(items, list): items = [items] @@ -143,7 +204,10 @@ def _write_to_dynamic_table(self, resource: DltResource, items: TDataItems) -> N ) # write to storage with inferred table name if table_name not in self._filtered_tables: - self._write_item(table_name, resource.name, item) + if isinstance(meta, ImportFileMeta): + self._import_item(table_name, resource.name, meta) + else: + self._write_item(table_name, resource.name, item) def 
_write_to_static_table( self, resource: DltResource, table_name: str, items: TDataItems, meta: Any @@ -151,11 +215,16 @@ def _write_to_static_table( if table_name not in self._table_contracts: items = self._compute_and_update_table(resource, table_name, items, meta) if table_name not in self._filtered_tables: - self._write_item(table_name, resource.name, items) + if isinstance(meta, ImportFileMeta): + self._import_item(table_name, resource.name, meta) + else: + self._write_item(table_name, resource.name, items) def _compute_table(self, resource: DltResource, items: TDataItems, meta: Any) -> TTableSchema: """Computes a schema for a new or dynamic table and normalizes identifiers""" - return self.schema.normalize_table_identifiers(resource.compute_table_schema(items, meta)) + return utils.normalize_table_identifiers( + resource.compute_table_schema(items, meta), self.schema.naming + ) def _compute_and_update_table( self, resource: DltResource, table_name: str, items: TDataItems, meta: Any @@ -173,11 +242,11 @@ def _compute_and_update_table( # this is a new table so allow evolve once if schema_contract["columns"] != "evolve" and self.schema.is_new_table(table_name): - computed_table["x-normalizer"] = {"evolve-columns-once": True} # type: ignore[typeddict-unknown-key] + computed_table["x-normalizer"] = {"evolve-columns-once": True} existing_table = self.schema._schema_tables.get(table_name, None) if existing_table: # TODO: revise this. computed table should overwrite certain hints (ie. primary and merge keys) completely - diff_table = utils.diff_table(existing_table, computed_table) + diff_table = utils.diff_table(self.schema.name, existing_table, computed_table) else: diff_table = computed_table @@ -335,7 +404,7 @@ def _compute_table( computed_table = super()._compute_table(resource, item, Any) # Merge the columns to include primary_key and other hints that may be set on the resource if arrow_table: - utils.merge_table(computed_table, arrow_table) + utils.merge_table(self.schema.name, computed_table, arrow_table) else: arrow_table = copy(computed_table) arrow_table["columns"] = pyarrow.py_arrow_to_table_schema_columns(item.schema) @@ -353,8 +422,7 @@ def _compute_table( } # normalize arrow table before merging - arrow_table = self.schema.normalize_table_identifiers(arrow_table) - + arrow_table = utils.normalize_table_identifiers(arrow_table, self.schema.naming) # issue warnings when overriding computed with arrow override_warn: bool = False for col_name, column in arrow_table["columns"].items(): diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index 6fd1928970..bc10177223 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -5,6 +5,7 @@ from dlt.common.schema.typing import ( TColumnNames, TColumnProp, + TFileFormat, TPartialTableSchema, TTableSchema, TTableSchemaColumns, @@ -48,6 +49,7 @@ class TResourceHints(TypedDict, total=False): incremental: Incremental[Any] schema_contract: TTableHintTemplate[TSchemaContract] table_format: TTableHintTemplate[TTableFormat] + file_format: TTableHintTemplate[TFileFormat] validator: ValidateItem original_columns: TTableHintTemplate[TAnySchemaColumns] @@ -72,6 +74,7 @@ def make_hints( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, + file_format: TTableHintTemplate[TFileFormat] = None, ) -> TResourceHints: """A convenience function to create resource hints. Accepts both static and dynamic hints based on data. 
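The `with_file_import` marker and the extended `make_hints` helper introduced above are meant to be used together from a resource. A minimal sketch, assuming a pre-existing `jsonl` file at a hypothetical path and illustrative column hints (as the docstring notes, `dlt` will not sniff the schema or guess the file format for you):

```py
import dlt

@dlt.resource(name="events")
def external_events():
    # column hints are needed because the imported file bypasses normalization;
    # the column definition below is illustrative
    hints = dlt.mark.make_hints(columns=[{"name": "event_id", "data_type": "bigint"}])
    # "/data/events.jsonl" is a hypothetical, already existing file;
    # items_count lets dlt report extract metrics for the imported rows
    yield dlt.mark.with_file_import(
        "/data/events.jsonl", "jsonl", items_count=1000, hints=hints
    )
```

The `FileImportNormalizer` added later in this patch then copies such files into the load package without normalizing them, which is why complete column hints (or a pre-created destination table) are required.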
@@ -91,6 +94,7 @@ def make_hints( columns=clean_columns, # type: ignore schema_contract=schema_contract, # type: ignore table_format=table_format, # type: ignore + file_format=file_format, # type: ignore ) if not table_name: new_template.pop("name") @@ -209,6 +213,7 @@ def apply_hints( schema_contract: TTableHintTemplate[TSchemaContract] = None, additional_table_hints: Optional[Dict[str, TTableHintTemplate[Any]]] = None, table_format: TTableHintTemplate[TTableFormat] = None, + file_format: TTableHintTemplate[TFileFormat] = None, create_table_variant: bool = False, ) -> None: """Creates or modifies existing table schema by setting provided hints. Accepts both static and dynamic hints based on data. @@ -256,6 +261,7 @@ def apply_hints( merge_key, schema_contract, table_format, + file_format, ) else: t = self._clone_hints(t) @@ -320,6 +326,11 @@ def apply_hints( t["table_format"] = table_format else: t.pop("table_format", None) + if file_format is not None: + if file_format: + t["file_format"] = file_format + else: + t.pop("file_format", None) # set properties that can't be passed to make_hints if incremental is not None: @@ -375,6 +386,7 @@ def merge_hints( incremental=hints_template.get("incremental"), schema_contract=hints_template.get("schema_contract"), table_format=hints_template.get("table_format"), + file_format=hints_template.get("file_format"), create_table_variant=create_table_variant, ) diff --git a/dlt/extract/items.py b/dlt/extract/items.py index fec31e2846..4cf8d2191f 100644 --- a/dlt/extract/items.py +++ b/dlt/extract/items.py @@ -160,6 +160,10 @@ class FilterItem(ItemTransform[bool]): def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: if isinstance(item, list): + # preserve empty lists + if len(item) == 0: + return item + if self._f_meta: item = [i for i in item if self._f_meta(i, meta)] else: diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index eecb570375..93eb9d1189 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -1,4 +1,3 @@ -from copy import deepcopy import inspect from functools import partial from typing import ( @@ -14,6 +13,7 @@ ) from typing_extensions import TypeVar, Self +from dlt.common import logger from dlt.common.configuration.inject import get_fun_spec, with_config from dlt.common.configuration.resolve import inject_section from dlt.common.configuration.specs import known_sections @@ -394,6 +394,11 @@ def _gen_wrap(gen: TPipeStep) -> TPipeStep: else: # keep function as function to not evaluate generators before pipe starts self._pipe.replace_gen(partial(_gen_wrap, gen)) + else: + logger.warning( + f"Setting add_limit to a transformer {self.name} has no effect. Set the limit on" + " the top level resource." 
+ ) return self def parallelize(self: TDltResourceImpl) -> TDltResourceImpl: diff --git a/dlt/extract/source.py b/dlt/extract/source.py index 658f884c40..9953b56117 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -11,6 +11,7 @@ from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer from dlt.common.schema import Schema from dlt.common.schema.typing import TColumnName, TSchemaContract +from dlt.common.schema.utils import normalize_table_identifiers from dlt.common.typing import StrAny, TDataItem from dlt.common.configuration.container import Container from dlt.common.pipeline import ( @@ -245,26 +246,39 @@ def exhausted(self) -> bool: @property def root_key(self) -> bool: """Enables merging on all resources by propagating root foreign key to child tables. This option is most useful if you plan to change write disposition of a resource to disable/enable merge""" + # this also check the normalizer type config = RelationalNormalizer.get_normalizer_config(self._schema).get("propagation") + data_normalizer = self._schema.data_item_normalizer + assert isinstance(data_normalizer, RelationalNormalizer) return ( config is not None and "root" in config - and "_dlt_id" in config["root"] - and config["root"]["_dlt_id"] == "_dlt_root_id" + and data_normalizer.c_dlt_id in config["root"] + and config["root"][data_normalizer.c_dlt_id] == data_normalizer.c_dlt_root_id ) @root_key.setter def root_key(self, value: bool) -> None: + # this also check the normalizer type + config = RelationalNormalizer.get_normalizer_config(self._schema) + data_normalizer = self._schema.data_item_normalizer + assert isinstance(data_normalizer, RelationalNormalizer) + if value is True: RelationalNormalizer.update_normalizer_config( - self._schema, {"propagation": {"root": {"_dlt_id": TColumnName("_dlt_root_id")}}} + self._schema, + { + "propagation": { + "root": { + data_normalizer.c_dlt_id: TColumnName(data_normalizer.c_dlt_root_id) + } + } + }, ) else: if self.root_key: - propagation_config = RelationalNormalizer.get_normalizer_config(self._schema)[ - "propagation" - ] - propagation_config["root"].pop("_dlt_id") # type: ignore + propagation_config = config["propagation"] + propagation_config["root"].pop(data_normalizer.c_dlt_id) @property def resources(self) -> DltResourceDict: @@ -291,8 +305,8 @@ def discover_schema(self, item: TDataItem = None) -> Schema: for r in self.selected_resources.values(): # names must be normalized here with contextlib.suppress(DataItemRequiredForDynamicTableHints): - partial_table = self._schema.normalize_table_identifiers( - r.compute_table_schema(item) + partial_table = normalize_table_identifiers( + r.compute_table_schema(item), self._schema.naming ) schema.update_table(partial_table) return schema diff --git a/dlt/load/configuration.py b/dlt/load/configuration.py index 8abc679ea2..836da516e9 100644 --- a/dlt/load/configuration.py +++ b/dlt/load/configuration.py @@ -1,11 +1,10 @@ -from typing import TYPE_CHECKING, Literal, Optional +from typing import Optional from dlt.common.configuration import configspec +from dlt.common.destination.capabilities import TLoaderParallelismStrategy from dlt.common.storages import LoadStorageConfiguration from dlt.common.runners.configuration import PoolRunnerConfiguration, TPoolType -TLoaderParallelismStrategy = Literal["parallel", "table-sequential", "sequential"] - @configspec class LoaderConfiguration(PoolRunnerConfiguration): diff --git a/dlt/load/load.py b/dlt/load/load.py index abbeee5ddf..9d1d953f7f 
100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -80,7 +80,6 @@ def __init__( self.initial_client_config = initial_client_config self.initial_staging_client_config = initial_staging_client_config self.destination = destination - self.capabilities = destination.capabilities() self.staging_destination = staging_destination self.pool = NullExecutor() self.load_storage: LoadStorage = self.create_storage(is_storage_owner) @@ -88,7 +87,7 @@ def __init__( super().__init__() def create_storage(self, is_storage_owner: bool) -> LoadStorage: - supported_file_formats = self.capabilities.supported_loader_file_formats + supported_file_formats = self.destination.capabilities().supported_loader_file_formats if self.staging_destination: supported_file_formats = ( self.staging_destination.capabilities().supported_loader_file_formats @@ -150,7 +149,7 @@ def w_spool_job( if job_info.file_format not in self.load_storage.supported_job_file_formats: raise LoadClientUnsupportedFileFormats( job_info.file_format, - self.capabilities.supported_loader_file_formats, + self.destination.capabilities().supported_loader_file_formats, file_path, ) logger.info(f"Will load file {file_path} with table name {job_info.table_name}") @@ -197,7 +196,7 @@ def w_spool_job( def spool_new_jobs(self, load_id: str, schema: Schema) -> Tuple[int, List[LoadJob]]: # use thread based pool as jobs processing is mostly I/O and we do not want to pickle jobs load_files = filter_new_jobs( - self.load_storage.list_new_jobs(load_id), self.capabilities, self.config + self.load_storage.list_new_jobs(load_id), self.destination.capabilities(), self.config ) file_count = len(load_files) if file_count == 0: @@ -259,13 +258,20 @@ def create_followup_jobs( schema.tables, starting_job.job_file_info().table_name ) # if all tables of chain completed, create follow up jobs - all_jobs = self.load_storage.normalized_packages.list_all_jobs(load_id) + all_jobs_states = self.load_storage.normalized_packages.list_all_jobs_with_states( + load_id + ) if table_chain := get_completed_table_chain( - schema, all_jobs, top_job_table, starting_job.job_file_info().job_id() + schema, all_jobs_states, top_job_table, starting_job.job_file_info().job_id() ): table_chain_names = [table["name"] for table in table_chain] + # create job infos that contain full path to job table_chain_jobs = [ - job for job in all_jobs if job.job_file_info.table_name in table_chain_names + self.load_storage.normalized_packages.job_to_job_info(load_id, *job_state) + for job_state in all_jobs_states + if job_state[1].table_name in table_chain_names + # job being completed is still in started_jobs + and job_state[0] in ("completed_jobs", "started_jobs") ] if follow_up_jobs := client.create_table_chain_completed_followup_jobs( table_chain, table_chain_jobs @@ -359,7 +365,7 @@ def complete_package(self, load_id: str, schema: Schema, aborted: bool = False) ) ): job_client.complete_load(load_id) - self._maybe_trancate_staging_dataset(schema, job_client) + self._maybe_truncate_staging_dataset(schema, job_client) self.load_storage.complete_load_package(load_id, aborted) # collect package info @@ -432,10 +438,10 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: self.complete_package(load_id, schema, False) return # update counter we only care about the jobs that are scheduled to be loaded - package_info = self.load_storage.normalized_packages.get_load_package_info(load_id) - total_jobs = reduce(lambda p, c: p + len(c), package_info.jobs.values(), 0) - no_failed_jobs = 
len(package_info.jobs["failed_jobs"]) - no_completed_jobs = len(package_info.jobs["completed_jobs"]) + no_failed_jobs + package_jobs = self.load_storage.normalized_packages.get_load_package_jobs(load_id) + total_jobs = reduce(lambda p, c: p + len(c), package_jobs.values(), 0) + no_failed_jobs = len(package_jobs["failed_jobs"]) + no_completed_jobs = len(package_jobs["completed_jobs"]) + no_failed_jobs self.collector.update("Jobs", no_completed_jobs, total_jobs) if no_failed_jobs > 0: self.collector.update( @@ -447,26 +453,28 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: remaining_jobs = self.complete_jobs(load_id, jobs, schema) if len(remaining_jobs) == 0: # get package status - package_info = self.load_storage.normalized_packages.get_load_package_info( + package_jobs = self.load_storage.normalized_packages.get_load_package_jobs( load_id ) # possibly raise on failed jobs if self.config.raise_on_failed_jobs: - if package_info.jobs["failed_jobs"]: - failed_job = package_info.jobs["failed_jobs"][0] + if package_jobs["failed_jobs"]: + failed_job = package_jobs["failed_jobs"][0] raise LoadClientJobFailed( load_id, - failed_job.job_file_info.job_id(), - failed_job.failed_message, + failed_job.job_id(), + self.load_storage.normalized_packages.get_job_failed_message( + load_id, failed_job + ), ) # possibly raise on too many retries if self.config.raise_on_max_retries: - for new_job in package_info.jobs["new_jobs"]: - r_c = new_job.job_file_info.retry_count + for new_job in package_jobs["new_jobs"]: + r_c = new_job.retry_count if r_c > 0 and r_c % self.config.raise_on_max_retries == 0: raise LoadClientJobRetry( load_id, - new_job.job_file_info.job_id(), + new_job.job_id(), r_c, self.config.raise_on_max_retries, ) @@ -512,7 +520,7 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: return TRunMetrics(False, len(self.load_storage.list_normalized_packages())) - def _maybe_trancate_staging_dataset(self, schema: Schema, job_client: JobClientBase) -> None: + def _maybe_truncate_staging_dataset(self, schema: Schema, job_client: JobClientBase) -> None: """ Truncate the staging dataset if one used, and configuration requests truncation. 
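Going back to the `apply_hints` extension in `dlt/extract/hints.py` earlier in this patch: the new `file_format` hint can also be set on an existing resource instance instead of in the decorator. A minimal sketch with illustrative values; the chosen formats are only honored by destinations that support them, and passing an empty value clears a previously set hint, mirroring the `pop()` branches in the diff above.

```py
import dlt

@dlt.resource
def orders():
    yield {"order_id": 1, "amount": 10.5}

orders_resource = orders()
# set the new hints after the resource is created; values are illustrative
orders_resource.apply_hints(table_format="delta", file_format="parquet")
# an empty (falsy, non-None) value removes a previously set hint
orders_resource.apply_hints(file_format="")
```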
diff --git a/dlt/load/utils.py b/dlt/load/utils.py index 4e5099855b..7db05674fa 100644 --- a/dlt/load/utils.py +++ b/dlt/load/utils.py @@ -1,8 +1,8 @@ -from typing import List, Set, Iterable, Callable, Optional, Sequence +from typing import List, Set, Iterable, Callable, Optional, Tuple, Sequence from itertools import groupby from dlt.common import logger -from dlt.common.storages.load_package import LoadJobInfo, PackageStorage +from dlt.common.storages.load_package import LoadJobInfo, PackageStorage, TJobState from dlt.common.schema.utils import ( fill_hints_from_parent_and_clone_table, get_child_tables, @@ -22,7 +22,7 @@ def get_completed_table_chain( schema: Schema, - all_jobs: Iterable[LoadJobInfo], + all_jobs: Iterable[Tuple[TJobState, ParsedLoadJobFileName]], top_merged_table: TTableSchema, being_completed_job_id: str = None, ) -> List[TTableSchema]: @@ -54,8 +54,8 @@ def get_completed_table_chain( else: # all jobs must be completed in order for merge to be created if any( - job.state not in ("failed_jobs", "completed_jobs") - and job.job_file_info.job_id() != being_completed_job_id + job[0] not in ("failed_jobs", "completed_jobs") + and job[1].job_id() != being_completed_job_id for job in table_jobs ): return None diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index 6678f6edee..5f84d57d7a 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -6,6 +6,7 @@ from dlt.common.data_writers import DataWriterMetrics from dlt.common.data_writers.writers import ArrowToObjectAdapter from dlt.common.json import custom_pua_decode, may_have_pua +from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer from dlt.common.runtime import signals from dlt.common.schema.typing import TSchemaEvolutionMode, TTableSchemaColumns, TSchemaContractDict from dlt.common.schema.utils import has_table_seen_data @@ -149,7 +150,7 @@ def _normalize_chunk( continue # theres a new table or new columns in existing table # update schema and save the change - schema.update_table(partial_table) + schema.update_table(partial_table, normalize_identifiers=False) table_updates = schema_update.setdefault(table_name, []) table_updates.append(partial_table) @@ -200,6 +201,7 @@ def __call__( ) schema_updates.append(partial_update) logger.debug(f"Processed {line_no+1} lines from file {extracted_items_file}") + # empty json files are when replace write disposition is used in order to truncate table(s) if line is None and root_table_name in self.schema.tables: # TODO: we should push the truncate jobs via package state # not as empty jobs. 
empty jobs should be reserved for @@ -234,8 +236,9 @@ def _write_with_dlt_columns( schema = self.schema load_id = self.load_id schema_update: TSchemaUpdate = {} + data_normalizer = schema.data_item_normalizer - if add_dlt_id: + if add_dlt_id and isinstance(data_normalizer, RelationalNormalizer): table_update = schema.update_table( { "name": root_table_name, @@ -249,7 +252,7 @@ def _write_with_dlt_columns( new_columns.append( ( -1, - pa.field("_dlt_id", pyarrow.pyarrow.string(), nullable=False), + pa.field(data_normalizer.c_dlt_id, pyarrow.pyarrow.string(), nullable=False), lambda batch: pa.array(generate_dlt_ids(batch.num_rows)), ) ) @@ -375,3 +378,32 @@ def __call__(self, extracted_items_file: str, root_table_name: str) -> List[TSch ) return base_schema_update + + +class FileImportNormalizer(ItemsNormalizer): + def __call__(self, extracted_items_file: str, root_table_name: str) -> List[TSchemaUpdate]: + logger.info( + f"Table {root_table_name} {self.item_storage.writer_spec.file_format} file" + f" {extracted_items_file} will be directly imported without normalization" + ) + completed_columns = self.schema.get_table_columns(root_table_name) + if not completed_columns: + logger.warning( + f"Table {root_table_name} has no completed columns for imported file" + f" {extracted_items_file} and will not be created! Pass column hints to the" + " resource or with dlt.mark.with_hints or create the destination table yourself." + ) + with self.normalize_storage.extracted_packages.storage.open_file( + extracted_items_file, "rb" + ) as f: + # TODO: sniff the schema depending on a file type + file_metrics = DataWriterMetrics(extracted_items_file, 0, f.tell(), 0, 0) + parts = ParsedLoadJobFileName.parse(extracted_items_file) + self.item_storage.import_items_file( + self.load_id, + self.schema.name, + parts.table_name, + self.normalize_storage.extracted_packages.storage.make_full_path(extracted_items_file), + file_metrics, + ) + return [] diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 75cb9be707..98154cd5cf 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -1,33 +1,23 @@ import os import itertools -from typing import Callable, List, Dict, NamedTuple, Sequence, Tuple, Set, Optional +from typing import List, Dict, Sequence, Optional, Callable from concurrent.futures import Future, Executor from dlt.common import logger from dlt.common.runtime.signals import sleep from dlt.common.configuration import with_config, known_sections from dlt.common.configuration.accessors import config -from dlt.common.configuration.container import Container -from dlt.common.data_writers import ( - DataWriter, - DataWriterMetrics, - TDataItemFormat, - resolve_best_writer_spec, - get_best_writer_spec, - is_native_writer, -) +from dlt.common.data_writers import DataWriterMetrics from dlt.common.data_writers.writers import EMPTY_DATA_WRITER_METRICS from dlt.common.runners import TRunMetrics, Runnable, NullExecutor from dlt.common.runtime import signals from dlt.common.runtime.collector import Collector, NULL_COLLECTOR -from dlt.common.schema.typing import TStoredSchema, TTableSchema +from dlt.common.schema.typing import TStoredSchema from dlt.common.schema.utils import merge_schema_updates from dlt.common.storages import ( NormalizeStorage, SchemaStorage, LoadStorage, - LoadStorageConfiguration, - NormalizeStorageConfiguration, ParsedLoadJobFileName, ) from dlt.common.schema import TSchemaUpdate, Schema @@ -40,20 +30,10 @@ ) from dlt.common.storages.exceptions import 
LoadPackageNotFound from dlt.common.storages.load_package import LoadPackageInfo -from dlt.common.utils import chunks from dlt.normalize.configuration import NormalizeConfiguration from dlt.normalize.exceptions import NormalizeJobFailed -from dlt.normalize.items_normalizers import ( - ArrowItemsNormalizer, - JsonLItemsNormalizer, - ItemsNormalizer, -) - - -class TWorkerRV(NamedTuple): - schema_updates: List[TSchemaUpdate] - file_metrics: List[DataWriterMetrics] +from dlt.normalize.worker import w_normalize_files, group_worker_files, TWorkerRV # normalize worker wrapping function signature @@ -99,211 +79,19 @@ def create_storages(self) -> None: config=self.config._load_storage_config, ) - @staticmethod - def w_normalize_files( - config: NormalizeConfiguration, - normalize_storage_config: NormalizeStorageConfiguration, - loader_storage_config: LoadStorageConfiguration, - stored_schema: TStoredSchema, - load_id: str, - extracted_items_files: Sequence[str], - ) -> TWorkerRV: - destination_caps = config.destination_capabilities - schema_updates: List[TSchemaUpdate] = [] - # normalizers are cached per table name - item_normalizers: Dict[str, ItemsNormalizer] = {} - - preferred_file_format = ( - destination_caps.preferred_loader_file_format - or destination_caps.preferred_staging_file_format - ) - # TODO: capabilities.supported_*_formats can be None, it should have defaults - supported_file_formats = destination_caps.supported_loader_file_formats or [] - supported_table_formats = destination_caps.supported_table_formats or [] - - # process all files with data items and write to buffered item storage - with Container().injectable_context(destination_caps): - schema = Schema.from_stored_schema(stored_schema) - normalize_storage = NormalizeStorage(False, normalize_storage_config) - load_storage = LoadStorage(False, supported_file_formats, loader_storage_config) - - def _get_items_normalizer( - item_format: TDataItemFormat, table_schema: Optional[TTableSchema] - ) -> ItemsNormalizer: - table_name = table_schema["name"] - if table_name in item_normalizers: - return item_normalizers[table_name] - - if ( - "table_format" in table_schema - and table_schema["table_format"] not in supported_table_formats - ): - logger.warning( - "Destination does not support the configured `table_format` value " - f"`{table_schema['table_format']}` for table `{table_schema['name']}`. " - "The setting will probably be ignored." - ) - - items_preferred_file_format = preferred_file_format - items_supported_file_formats = supported_file_formats - if destination_caps.loader_file_format_adapter is not None: - items_preferred_file_format, items_supported_file_formats = ( - destination_caps.loader_file_format_adapter( - preferred_file_format, - ( - supported_file_formats.copy() - if isinstance(supported_file_formats, list) - else supported_file_formats - ), - table_schema=table_schema, - ) - ) - - # force file format - best_writer_spec = None - if config.loader_file_format: - if config.loader_file_format in items_supported_file_formats: - # TODO: pass supported_file_formats, when used in pipeline we already checked that - # but if normalize is used standalone `supported_loader_file_formats` may be unresolved - best_writer_spec = get_best_writer_spec( - item_format, config.loader_file_format - ) - else: - logger.warning( - f"The configured value `{config.loader_file_format}` " - "for `loader_file_format` is not supported for table " - f"`{table_schema['name']}` and will be ignored. Dlt " - "will use a supported format instead." 
- ) - - if best_writer_spec is None: - # find best spec among possible formats taking into account destination preference - best_writer_spec = resolve_best_writer_spec( - item_format, items_supported_file_formats, items_preferred_file_format - ) - # if best_writer_spec.file_format != preferred_file_format: - # logger.warning( - # f"For data items yielded as {item_format} jobs in file format" - # f" {preferred_file_format} cannot be created." - # f" {best_writer_spec.file_format} jobs will be used instead." - # " This may decrease the performance." - # ) - item_storage = load_storage.create_item_storage(best_writer_spec) - if not is_native_writer(item_storage.writer_cls): - logger.warning( - f"For data items yielded as {item_format} and job file format" - f" {best_writer_spec.file_format} native writer could not be found. A" - f" {item_storage.writer_cls.__name__} writer is used that internally" - f" converts {item_format}. This will degrade performance." - ) - cls = ArrowItemsNormalizer if item_format == "arrow" else JsonLItemsNormalizer - logger.info( - f"Created items normalizer {cls.__name__} with writer" - f" {item_storage.writer_cls.__name__} for item format {item_format} and file" - f" format {item_storage.writer_spec.file_format}" - ) - norm = item_normalizers[table_name] = cls( - item_storage, - normalize_storage, - schema, - load_id, - config, - ) - return norm - - def _gather_metrics_and_close( - parsed_fn: ParsedLoadJobFileName, in_exception: bool - ) -> List[DataWriterMetrics]: - writer_metrics: List[DataWriterMetrics] = [] - try: - try: - for normalizer in item_normalizers.values(): - normalizer.item_storage.close_writers(load_id, skip_flush=in_exception) - except Exception: - # if we had exception during flushing the writers, close them without flushing - if not in_exception: - for normalizer in item_normalizers.values(): - normalizer.item_storage.close_writers(load_id, skip_flush=True) - raise - finally: - # always gather metrics - for normalizer in item_normalizers.values(): - norm_metrics = normalizer.item_storage.closed_files(load_id) - writer_metrics.extend(norm_metrics) - for normalizer in item_normalizers.values(): - normalizer.item_storage.remove_closed_files(load_id) - except Exception as exc: - if in_exception: - # swallow exception if we already handle exceptions - return writer_metrics - else: - # enclose the exception during the closing in job failed exception - job_id = parsed_fn.job_id() if parsed_fn else "" - raise NormalizeJobFailed(load_id, job_id, str(exc), writer_metrics) - return writer_metrics - - parsed_file_name: ParsedLoadJobFileName = None - try: - root_tables: Set[str] = set() - for extracted_items_file in extracted_items_files: - parsed_file_name = ParsedLoadJobFileName.parse(extracted_items_file) - # normalize table name in case the normalization changed - # NOTE: this is the best we can do, until a full lineage information is in the schema - root_table_name = schema.naming.normalize_table_identifier( - parsed_file_name.table_name - ) - root_tables.add(root_table_name) - normalizer = _get_items_normalizer( - DataWriter.item_format_from_file_extension(parsed_file_name.file_format), - stored_schema["tables"].get(root_table_name, {"name": root_table_name}), - ) - logger.debug( - f"Processing extracted items in {extracted_items_file} in load_id" - f" {load_id} with table name {root_table_name} and schema {schema.name}" - ) - partial_updates = normalizer(extracted_items_file, root_table_name) - schema_updates.extend(partial_updates) - 
logger.debug(f"Processed file {extracted_items_file}") - except Exception as exc: - job_id = parsed_file_name.job_id() if parsed_file_name else "" - writer_metrics = _gather_metrics_and_close(parsed_file_name, in_exception=True) - raise NormalizeJobFailed(load_id, job_id, str(exc), writer_metrics) from exc - else: - writer_metrics = _gather_metrics_and_close(parsed_file_name, in_exception=False) - - logger.info(f"Processed all items in {len(extracted_items_files)} files") - return TWorkerRV(schema_updates, writer_metrics) - - def update_table(self, schema: Schema, schema_updates: List[TSchemaUpdate]) -> None: + def update_schema(self, schema: Schema, schema_updates: List[TSchemaUpdate]) -> None: for schema_update in schema_updates: for table_name, table_updates in schema_update.items(): logger.info( f"Updating schema for table {table_name} with {len(table_updates)} deltas" ) for partial_table in table_updates: - # merge columns - schema.update_table(partial_table) - - @staticmethod - def group_worker_files(files: Sequence[str], no_groups: int) -> List[Sequence[str]]: - # sort files so the same tables are in the same worker - files = list(sorted(files)) - - chunk_size = max(len(files) // no_groups, 1) - chunk_files = list(chunks(files, chunk_size)) - # distribute the remainder files to existing groups starting from the end - remainder_l = len(chunk_files) - no_groups - l_idx = 0 - while remainder_l > 0: - for idx, file in enumerate(reversed(chunk_files.pop())): - chunk_files[-l_idx - idx - remainder_l].append(file) # type: ignore - remainder_l -= 1 - l_idx = idx + 1 - return chunk_files + # merge columns where we expect identifiers to be normalized + schema.update_table(partial_table, normalize_identifiers=False) def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TWorkerRV: workers: int = getattr(self.pool, "_max_workers", 1) - chunk_files = self.group_worker_files(files, workers) + chunk_files = group_worker_files(files, workers) schema_dict: TStoredSchema = schema.to_dict() param_chunk = [ ( @@ -319,10 +107,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TW # return stats summary = TWorkerRV([], []) # push all tasks to queue - tasks = [ - (self.pool.submit(Normalize.w_normalize_files, *params), params) - for params in param_chunk - ] + tasks = [(self.pool.submit(w_normalize_files, *params), params) for params in param_chunk] while len(tasks) > 0: sleep(0.3) @@ -337,7 +122,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TW result: TWorkerRV = pending.result() try: # gather schema from all manifests, validate consistency and combine - self.update_table(schema, result[0]) + self.update_schema(schema, result[0]) summary.schema_updates.extend(result.schema_updates) summary.file_metrics.extend(result.file_metrics) # update metrics @@ -358,7 +143,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TW # TODO: it's time for a named tuple params = params[:3] + (schema_dict,) + params[4:] retry_pending: Future[TWorkerRV] = self.pool.submit( - Normalize.w_normalize_files, *params + w_normalize_files, *params ) tasks.append((retry_pending, params)) # remove finished tasks @@ -368,7 +153,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TW return summary def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TWorkerRV: - result = Normalize.w_normalize_files( + result = w_normalize_files( self.config, 
self.normalize_storage.config, self.load_storage.config, @@ -376,7 +161,7 @@ def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TWor load_id, files, ) - self.update_table(schema, result.schema_updates) + self.update_schema(schema, result.schema_updates) self.collector.update("Files", len(result.file_metrics)) self.collector.update( "Items", sum(result.file_metrics, EMPTY_DATA_WRITER_METRICS).items_count @@ -399,7 +184,7 @@ def spool_files( # update normalizer specific info for table_name in table_metrics: table = schema.tables[table_name] - x_normalizer = table.setdefault("x-normalizer", {}) # type: ignore[typeddict-item] + x_normalizer = table.setdefault("x-normalizer", {}) # drop evolve once for all tables that seen data x_normalizer.pop("evolve-columns-once", None) # mark that table have seen data only if there was data diff --git a/dlt/normalize/worker.py b/dlt/normalize/worker.py new file mode 100644 index 0000000000..d5d4a028d9 --- /dev/null +++ b/dlt/normalize/worker.py @@ -0,0 +1,254 @@ +from typing import Callable, List, Dict, NamedTuple, Sequence, Set, Optional, Type + +from dlt.common import logger +from dlt.common.configuration.container import Container +from dlt.common.data_writers import ( + DataWriter, + DataWriterMetrics, + create_import_spec, + resolve_best_writer_spec, + get_best_writer_spec, + is_native_writer, +) +from dlt.common.utils import chunks +from dlt.common.schema.typing import TStoredSchema, TTableSchema +from dlt.common.storages import ( + NormalizeStorage, + LoadStorage, + LoadStorageConfiguration, + NormalizeStorageConfiguration, + ParsedLoadJobFileName, +) +from dlt.common.schema import TSchemaUpdate, Schema + +from dlt.normalize.configuration import NormalizeConfiguration +from dlt.normalize.exceptions import NormalizeJobFailed +from dlt.normalize.items_normalizers import ( + ArrowItemsNormalizer, + FileImportNormalizer, + JsonLItemsNormalizer, + ItemsNormalizer, +) + + +class TWorkerRV(NamedTuple): + schema_updates: List[TSchemaUpdate] + file_metrics: List[DataWriterMetrics] + + +def group_worker_files(files: Sequence[str], no_groups: int) -> List[Sequence[str]]: + # sort files so the same tables are in the same worker + files = list(sorted(files)) + + chunk_size = max(len(files) // no_groups, 1) + chunk_files = list(chunks(files, chunk_size)) + # distribute the remainder files to existing groups starting from the end + remainder_l = len(chunk_files) - no_groups + l_idx = 0 + while remainder_l > 0: + for idx, file in enumerate(reversed(chunk_files.pop())): + chunk_files[-l_idx - idx - remainder_l].append(file) # type: ignore + remainder_l -= 1 + l_idx = idx + 1 + return chunk_files + + +def w_normalize_files( + config: NormalizeConfiguration, + normalize_storage_config: NormalizeStorageConfiguration, + loader_storage_config: LoadStorageConfiguration, + stored_schema: TStoredSchema, + load_id: str, + extracted_items_files: Sequence[str], +) -> TWorkerRV: + destination_caps = config.destination_capabilities + schema_updates: List[TSchemaUpdate] = [] + # normalizers are cached per table name + item_normalizers: Dict[str, ItemsNormalizer] = {} + + preferred_file_format = ( + destination_caps.preferred_loader_file_format + or destination_caps.preferred_staging_file_format + ) + # TODO: capabilities.supported_*_formats can be None, it should have defaults + supported_file_formats = destination_caps.supported_loader_file_formats or [] + supported_table_formats = destination_caps.supported_table_formats or [] + + # process all files 
with data items and write to buffered item storage + with Container().injectable_context(destination_caps): + schema = Schema.from_stored_schema(stored_schema) + normalize_storage = NormalizeStorage(False, normalize_storage_config) + load_storage = LoadStorage(False, supported_file_formats, loader_storage_config) + + def _get_items_normalizer( + parsed_file_name: ParsedLoadJobFileName, table_schema: TTableSchema + ) -> ItemsNormalizer: + item_format = DataWriter.item_format_from_file_extension(parsed_file_name.file_format) + + table_name = table_schema["name"] + if table_name in item_normalizers: + return item_normalizers[table_name] + + if ( + "table_format" in table_schema + and table_schema["table_format"] not in supported_table_formats + ): + logger.warning( + "Destination does not support the configured `table_format` value " + f"`{table_schema['table_format']}` for table `{table_schema['name']}`. " + "The setting will probably be ignored." + ) + + items_preferred_file_format = preferred_file_format + items_supported_file_formats = supported_file_formats + if destination_caps.loader_file_format_adapter is not None: + items_preferred_file_format, items_supported_file_formats = ( + destination_caps.loader_file_format_adapter( + preferred_file_format, + ( + supported_file_formats.copy() + if isinstance(supported_file_formats, list) + else supported_file_formats + ), + table_schema=table_schema, + ) + ) + + best_writer_spec = None + if item_format == "file": + # if we want to import file, create a spec that may be used only for importing + best_writer_spec = create_import_spec( + parsed_file_name.file_format, items_supported_file_formats # type: ignore[arg-type] + ) + + config_loader_file_format = config.loader_file_format + if file_format := table_schema.get("file_format"): + # resource has a file format defined so use it + if file_format == "preferred": + # use destination preferred + config_loader_file_format = items_preferred_file_format + else: + # use resource format + config_loader_file_format = file_format + logger.info( + f"A file format for table {table_name} was specified to {file_format} in the" + f" resource so {config_loader_file_format} format being used." + ) + + if config_loader_file_format and best_writer_spec is None: + # force file format + if config_loader_file_format in items_supported_file_formats: + # TODO: pass supported_file_formats, when used in pipeline we already checked that + # but if normalize is used standalone `supported_loader_file_formats` may be unresolved + best_writer_spec = get_best_writer_spec(item_format, config_loader_file_format) + else: + logger.warning( + f"The configured value `{config_loader_file_format}` " + "for `loader_file_format` is not supported for table " + f"`{table_name}` and will be ignored. Dlt " + "will use a supported format instead." + ) + + if best_writer_spec is None: + # find best spec among possible formats taking into account destination preference + best_writer_spec = resolve_best_writer_spec( + item_format, items_supported_file_formats, items_preferred_file_format + ) + # if best_writer_spec.file_format != preferred_file_format: + # logger.warning( + # f"For data items yielded as {item_format} jobs in file format" + # f" {preferred_file_format} cannot be created." + # f" {best_writer_spec.file_format} jobs will be used instead." + # " This may decrease the performance." 
+ # ) + item_storage = load_storage.create_item_storage(best_writer_spec) + if not is_native_writer(item_storage.writer_cls): + logger.warning( + f"For data items yielded as {item_format} and job file format" + f" {best_writer_spec.file_format} native writer could not be found. A" + f" {item_storage.writer_cls.__name__} writer is used that internally" + f" converts {item_format}. This will degrade performance." + ) + cls: Type[ItemsNormalizer] + if item_format == "arrow": + cls = ArrowItemsNormalizer + elif item_format == "object": + cls = JsonLItemsNormalizer + else: + cls = FileImportNormalizer + logger.info( + f"Created items normalizer {cls.__name__} with writer" + f" {item_storage.writer_cls.__name__} for item format {item_format} and file" + f" format {item_storage.writer_spec.file_format}" + ) + norm = item_normalizers[table_name] = cls( + item_storage, + normalize_storage, + schema, + load_id, + config, + ) + return norm + + def _gather_metrics_and_close( + parsed_fn: ParsedLoadJobFileName, in_exception: bool + ) -> List[DataWriterMetrics]: + writer_metrics: List[DataWriterMetrics] = [] + try: + try: + for normalizer in item_normalizers.values(): + normalizer.item_storage.close_writers(load_id, skip_flush=in_exception) + except Exception: + # if we had exception during flushing the writers, close them without flushing + if not in_exception: + for normalizer in item_normalizers.values(): + normalizer.item_storage.close_writers(load_id, skip_flush=True) + raise + finally: + # always gather metrics + for normalizer in item_normalizers.values(): + norm_metrics = normalizer.item_storage.closed_files(load_id) + writer_metrics.extend(norm_metrics) + for normalizer in item_normalizers.values(): + normalizer.item_storage.remove_closed_files(load_id) + except Exception as exc: + if in_exception: + # swallow exception if we already handle exceptions + return writer_metrics + else: + # enclose the exception during the closing in job failed exception + job_id = parsed_fn.job_id() if parsed_fn else "" + raise NormalizeJobFailed(load_id, job_id, str(exc), writer_metrics) + return writer_metrics + + parsed_file_name: ParsedLoadJobFileName = None + try: + root_tables: Set[str] = set() + for extracted_items_file in extracted_items_files: + parsed_file_name = ParsedLoadJobFileName.parse(extracted_items_file) + # normalize table name in case the normalization changed + # NOTE: this is the best we can do, until a full lineage information is in the schema + root_table_name = schema.naming.normalize_table_identifier( + parsed_file_name.table_name + ) + root_tables.add(root_table_name) + normalizer = _get_items_normalizer( + parsed_file_name, + stored_schema["tables"].get(root_table_name, {"name": root_table_name}), + ) + logger.debug( + f"Processing extracted items in {extracted_items_file} in load_id" + f" {load_id} with table name {root_table_name} and schema {schema.name}" + ) + partial_updates = normalizer(extracted_items_file, root_table_name) + schema_updates.extend(partial_updates) + logger.debug(f"Processed file {extracted_items_file}") + except Exception as exc: + job_id = parsed_file_name.job_id() if parsed_file_name else "" + writer_metrics = _gather_metrics_and_close(parsed_file_name, in_exception=True) + raise NormalizeJobFailed(load_id, job_id, str(exc), writer_metrics) from exc + else: + writer_metrics = _gather_metrics_and_close(parsed_file_name, in_exception=False) + + logger.info(f"Processed all items in {len(extracted_items_files)} files") + return TWorkerRV(schema_updates, 
writer_metrics) diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index 20ba0b07d0..4efc7716e6 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -173,31 +173,39 @@ def attach( pipeline_name: str = None, pipelines_dir: str = None, pipeline_salt: TSecretValue = None, - full_refresh: Optional[bool] = None, - dev_mode: bool = False, - credentials: Any = None, + destination: TDestinationReferenceArg = None, + staging: TDestinationReferenceArg = None, progress: TCollectorArg = _NULL_COLLECTOR, **injection_kwargs: Any, ) -> Pipeline: - """Attaches to the working folder of `pipeline_name` in `pipelines_dir` or in default directory. Requires that valid pipeline state exists in working folder.""" + """Attaches to the working folder of `pipeline_name` in `pipelines_dir` or in default directory. Requires that valid pipeline state exists in working folder. + Pre-configured `destination` and `staging` factories may be provided. If not present, default factories are created from pipeline state. + """ ensure_correct_pipeline_kwargs(attach, **injection_kwargs) - full_refresh_argument_deprecated("attach", full_refresh) # if working_dir not provided use temp folder if not pipelines_dir: pipelines_dir = get_dlt_pipelines_dir() progress = collector_from_name(progress) + destination = Destination.from_reference( + destination or injection_kwargs["destination_type"], + destination_name=injection_kwargs["destination_name"], + ) + staging = Destination.from_reference( + staging or injection_kwargs.get("staging_type", None), + destination_name=injection_kwargs.get("staging_name", None), + ) # create new pipeline instance p = Pipeline( pipeline_name, pipelines_dir, pipeline_salt, + destination, + staging, None, None, None, - credentials, - None, None, - full_refresh if full_refresh is not None else dev_mode, + False, # always False as dev_mode so we do not wipe the working folder progress, True, last_config(**injection_kwargs), diff --git a/dlt/pipeline/dbt.py b/dlt/pipeline/dbt.py index ee900005fd..0b6ec5f896 100644 --- a/dlt/pipeline/dbt.py +++ b/dlt/pipeline/dbt.py @@ -38,7 +38,7 @@ def get_venv( # keep venv inside pipeline if path is relative if not os.path.isabs(venv_path): pipeline._pipeline_storage.create_folder(venv_path, exists_ok=True) - venv_dir = pipeline._pipeline_storage.make_full_path(venv_path) + venv_dir = pipeline._pipeline_storage.make_full_path_safe(venv_path) else: venv_dir = venv_path # try to restore existing venv diff --git a/dlt/pipeline/mark.py b/dlt/pipeline/mark.py index 3956d9bbe2..5f3122e7a5 100644 --- a/dlt/pipeline/mark.py +++ b/dlt/pipeline/mark.py @@ -2,6 +2,7 @@ from dlt.extract import ( with_table_name, with_hints, + with_file_import, make_hints, materialize_schema_item as materialize_table_schema, ) diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 392b195ff2..11f8d6223e 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -1,6 +1,5 @@ import contextlib import os -import datetime # noqa: 251 from contextlib import contextmanager from functools import wraps from typing import ( @@ -38,7 +37,6 @@ DestinationUndefinedEntity, ) from dlt.common.exceptions import MissingDependencyException -from dlt.common.normalizers import explicit_normalizers, import_normalizers from dlt.common.runtime import signals, initialize_runtime from dlt.common.schema.typing import ( TColumnNames, @@ -84,12 +82,12 @@ DestinationClientStagingConfiguration, DestinationClientDwhWithStagingConfiguration, ) +from 
dlt.common.normalizers.naming import NamingConvention from dlt.common.pipeline import ( ExtractInfo, LoadInfo, NormalizeInfo, PipelineContext, - StepInfo, TStepInfo, SupportsPipeline, TPipelineLocalState, @@ -104,7 +102,7 @@ from dlt.common.warnings import deprecated, Dlt04DeprecationWarning from dlt.common.versioned_state import json_encode_state, json_decode_state -from dlt.extract import DltSource, DltResource +from dlt.extract import DltSource from dlt.extract.exceptions import SourceExhausted from dlt.extract.extract import Extract, data_to_sources from dlt.normalize import Normalize @@ -125,7 +123,6 @@ PipelineStepFailed, SqlClientNotAvailable, FSClientNotAvailable, - PipelineNeverRan, ) from dlt.pipeline.trace import ( PipelineTrace, @@ -360,14 +357,14 @@ def __init__( self._init_working_dir(pipeline_name, pipelines_dir) with self.managed_state() as state: + self.credentials = credentials + self._configure(import_schema_path, export_schema_path, must_attach_to_local_pipeline) # changing the destination could be dangerous if pipeline has pending load packages - self._set_destinations(destination=destination, staging=staging) + self._set_destinations(destination=destination, staging=staging, initializing=True) # set the pipeline properties from state, destination and staging will not be set self._state_to_props(state) # we overwrite the state with the values from init self._set_dataset_name(dataset_name) - self.credentials = credentials - self._configure(import_schema_path, export_schema_path, must_attach_to_local_pipeline) def drop(self, pipeline_name: str = None) -> "Pipeline": """Deletes local pipeline state, schemas and any working files. @@ -448,14 +445,13 @@ def extract( refresh=refresh or self.refresh, ) # extract state - state: TPipelineStateDoc = None if self.config.restore_from_destination: # this will update state version hash so it will not be extracted again by with_state_sync - state = self._bump_version_and_extract_state( + self._bump_version_and_extract_state( self._container[StateInjectableContext].state, True, extract_step ) # commit load packages with state - extract_step.commit_packages(state) + extract_step.commit_packages() return self._get_step_info(extract_step) except Exception as exc: # emit step info @@ -867,6 +863,11 @@ def state(self) -> TPipelineState: """Returns a dictionary with the pipeline state""" return self._get_state() + @property + def naming(self) -> NamingConvention: + """Returns naming convention of the default schema""" + return self._get_schema_or_create().naming + @property def last_trace(self) -> PipelineTrace: """Returns or loads last trace generated by pipeline. 
The trace is loaded from standard location.""" @@ -1142,10 +1143,6 @@ def _extract_source( source, max_parallel_items, workers, load_package_state_update=load_package_state_update ) - # save import with fully discovered schema - # NOTE: moved to with_schema_sync, remove this if all test pass - # self._schema_storage.save_import_schema_if_not_exists(source.schema) - # update live schema but not update the store yet source.schema = self._schema_storage.set_live_schema(source.schema) @@ -1253,10 +1250,28 @@ def _get_destination_capabilities(self) -> DestinationCapabilitiesContext: "Please provide `destination` argument to `pipeline`, `run` or `load` method" " directly or via .dlt config.toml file or environment variable.", ) - return self.destination.capabilities() + # check if default schema is present + if ( + self.default_schema_name is not None + and self.default_schema_name in self._schema_storage + ): + naming = self.default_schema.naming + else: + naming = None + return self.destination.capabilities(naming=naming) def _get_staging_capabilities(self) -> Optional[DestinationCapabilitiesContext]: - return self.staging.capabilities() if self.staging is not None else None + if self.staging is None: + return None + # check if default schema is present + if ( + self.default_schema_name is not None + and self.default_schema_name in self._schema_storage + ): + naming = self.default_schema.naming + else: + naming = None + return self.staging.capabilities(naming=naming) def _validate_pipeline_name(self) -> None: try: @@ -1292,9 +1307,11 @@ def _set_destinations( destination_name: Optional[str] = None, staging: Optional[TDestinationReferenceArg] = None, staging_name: Optional[str] = None, + initializing: bool = False, ) -> None: - # destination_mod = DestinationReference.from_name(destination) - if destination: + destination_changed = destination is not None and destination != self.destination + # set destination if provided but do not swap if factory is the same + if destination_changed: self.destination = Destination.from_reference( destination, destination_name=destination_name ) @@ -1313,7 +1330,8 @@ def _set_destinations( staging = "filesystem" staging_name = "filesystem" - if staging: + staging_changed = staging is not None and staging != self.staging + if staging_changed: staging_module = Destination.from_reference(staging, destination_name=staging_name) if staging_module and not issubclass( staging_module.spec, DestinationClientStagingConfiguration @@ -1321,9 +1339,16 @@ def _set_destinations( raise DestinationNoStagingMode(staging_module.destination_name) self.staging = staging_module - with self._maybe_destination_capabilities(): - # default normalizers must match the destination - self._set_default_normalizers() + if staging_changed or destination_changed: + # make sure that capabilities can be generated + with self._maybe_destination_capabilities(): + # update normalizers in all live schemas, only when destination changed + if destination_changed and not initializing: + for schema in self._schema_storage.live_schemas.values(): + schema.update_normalizers() + # set new context + if not initializing: + self._set_context(is_active=True) @contextmanager def _maybe_destination_capabilities( @@ -1351,9 +1376,6 @@ def _maybe_destination_capabilities( if injected_caps: injected_caps.__exit__(None, None, None) - def _set_default_normalizers(self) -> None: - _, self._default_naming, _ = import_normalizers(explicit_normalizers()) - def _set_dataset_name(self, new_dataset_name: str) -> None: if 
not new_dataset_name and not self.dataset_name: # dataset name is required but not provided - generate the default now @@ -1600,7 +1622,7 @@ def _bump_version_and_extract_state( extract: Extract = None, load_package_state_update: Optional[Dict[str, Any]] = None, schema: Optional[Schema] = None, - ) -> TPipelineStateDoc: + ) -> None: """Merges existing state into `state` and extracts state using `storage` if extract_state is True. Storage will be created on demand. In that case the extracted package will be immediately committed. @@ -1608,13 +1630,24 @@ def _bump_version_and_extract_state( _, hash_, _ = bump_pipeline_state_version_if_modified(self._props_to_state(state)) should_extract = hash_ != state["_local"].get("_last_extracted_hash") if should_extract and extract_state: - data, doc = state_resource(state) - extract_ = extract or Extract( - self._schema_storage, self._normalize_storage_config(), original_data=data + extract_ = extract or Extract(self._schema_storage, self._normalize_storage_config()) + # create or get load package upfront to get load_id to create state doc + schema = schema or self.default_schema + # note that we preferably retrieve existing package for `schema` + # same thing happens in extract_.extract so the load_id is preserved + load_id = extract_.extract_storage.create_load_package( + schema, reuse_exiting_package=True ) + data, doc = state_resource(state, load_id) + # keep the original data to be used in the metrics + if extract_.original_data is None: + extract_.original_data = data + # append pipeline state to package state + load_package_state_update = load_package_state_update or {} + load_package_state_update["pipeline_state"] = doc self._extract_source( extract_, - data_to_sources(data, self, schema or self.default_schema)[0], + data_to_sources(data, self, schema)[0], 1, 1, load_package_state_update=load_package_state_update, @@ -1623,9 +1656,7 @@ def _bump_version_and_extract_state( mark_state_extracted(state, hash_) # commit only if we created storage if not extract: - extract_.commit_packages(doc) - return doc - return None + extract_.commit_packages() def _list_schemas_sorted(self) -> List[str]: """Lists schema names sorted to have deterministic state""" diff --git a/dlt/pipeline/state_sync.py b/dlt/pipeline/state_sync.py index 41009f2909..11648328f2 100644 --- a/dlt/pipeline/state_sync.py +++ b/dlt/pipeline/state_sync.py @@ -4,7 +4,8 @@ import dlt from dlt.common.pendulum import pendulum from dlt.common.typing import DictStrAny -from dlt.common.schema.typing import STATE_TABLE_NAME, TTableSchemaColumns +from dlt.common.schema.typing import PIPELINE_STATE_TABLE_NAME +from dlt.common.schema.utils import pipeline_state_table from dlt.common.destination.reference import WithStateSync, Destination, StateInfo from dlt.common.versioned_state import ( generate_state_version_hash, @@ -24,20 +25,6 @@ PIPELINE_STATE_ENGINE_VERSION = 4 LOAD_PACKAGE_STATE_KEY = "pipeline_state" -# state table columns -STATE_TABLE_COLUMNS: TTableSchemaColumns = { - "version": {"name": "version", "data_type": "bigint", "nullable": False}, - "engine_version": {"name": "engine_version", "data_type": "bigint", "nullable": False}, - "pipeline_name": {"name": "pipeline_name", "data_type": "text", "nullable": False}, - "state": {"name": "state", "data_type": "text", "nullable": False}, - "created_at": {"name": "created_at", "data_type": "timestamp", "nullable": False}, - "version_hash": { - "name": "version_hash", - "data_type": "text", - "nullable": True, - }, # set to nullable so we can 
migrate existing tables -} - def generate_pipeline_state_version_hash(state: TPipelineState) -> str: return generate_state_version_hash(state, exclude_attrs=["_local"]) @@ -98,27 +85,28 @@ def state_doc(state: TPipelineState, load_id: str = None) -> TPipelineStateDoc: state = copy(state) state.pop("_local") state_str = compress_state(state) - doc: TPipelineStateDoc = { - "version": state["_state_version"], - "engine_version": state["_state_engine_version"], - "pipeline_name": state["pipeline_name"], - "state": state_str, - "created_at": pendulum.now(), - "version_hash": state["_version_hash"], - } - if load_id: - doc["dlt_load_id"] = load_id - return doc + info = StateInfo( + version=state["_state_version"], + engine_version=state["_state_engine_version"], + pipeline_name=state["pipeline_name"], + state=state_str, + created_at=pendulum.now(), + version_hash=state["_version_hash"], + _dlt_load_id=load_id, + ) + return info.as_doc() -def state_resource(state: TPipelineState) -> Tuple[DltResource, TPipelineStateDoc]: - doc = state_doc(state) +def state_resource(state: TPipelineState, load_id: str) -> Tuple[DltResource, TPipelineStateDoc]: + doc = state_doc(state, load_id) + state_table = pipeline_state_table() return ( dlt.resource( [doc], - name=STATE_TABLE_NAME, - write_disposition="append", - columns=STATE_TABLE_COLUMNS, + name=PIPELINE_STATE_TABLE_NAME, + write_disposition=state_table["write_disposition"], + file_format=state_table["file_format"], + columns=state_table["columns"], ), doc, ) diff --git a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py index 380912a9a7..ce4b2a12d0 100644 --- a/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py +++ b/docs/examples/custom_destination_bigquery/custom_destination_bigquery.py @@ -86,7 +86,7 @@ def bigquery_insert( pipeline_name="csv_to_bigquery_insert", destination=bigquery_insert, dataset_name="mydata", - full_refresh=True, + dev_mode=True, ) load_info = pipeline.run(resource(url=OWID_DISASTERS_URL)) diff --git a/docs/examples/custom_destination_lancedb/custom_destination_lancedb.py b/docs/examples/custom_destination_lancedb/custom_destination_lancedb.py index 9d75d90f99..ba815d4fcd 100644 --- a/docs/examples/custom_destination_lancedb/custom_destination_lancedb.py +++ b/docs/examples/custom_destination_lancedb/custom_destination_lancedb.py @@ -38,7 +38,9 @@ from dlt.sources.helpers.rest_client import RESTClient, AuthConfigBase # access secrets to get openai key and instantiate embedding function -openai_api_key: str = dlt.secrets.get("destination.lancedb.credentials.embedding_model_provider_api_key") +openai_api_key: str = dlt.secrets.get( + "destination.lancedb.credentials.embedding_model_provider_api_key" +) func = get_registry().get("openai").create(name="text-embedding-3-small", api_key=openai_api_key) diff --git a/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py b/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py index 809a6cfbd6..5fbba98a21 100644 --- a/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py +++ b/docs/examples/pdf_to_weaviate/pdf_to_weaviate.py @@ -25,7 +25,7 @@ import os import dlt -from dlt.destinations.impl.weaviate import weaviate_adapter +from dlt.destinations.adapters import weaviate_adapter from PyPDF2 import PdfReader diff --git a/docs/examples/postgres_to_postgres/postgres_to_postgres.py b/docs/examples/postgres_to_postgres/postgres_to_postgres.py index f5327ee236..848af53317 100644 --- 
a/docs/examples/postgres_to_postgres/postgres_to_postgres.py +++ b/docs/examples/postgres_to_postgres/postgres_to_postgres.py @@ -170,7 +170,7 @@ def table_desc(table_name, pk, schema_name, order_date, columns="*"): pipeline_name=pipeline_name, destination="duckdb", dataset_name=target_schema_name, - full_refresh=True, + dev_mode=True, progress="alive_progress", ) else: @@ -178,8 +178,8 @@ def table_desc(table_name, pk, schema_name, order_date, columns="*"): pipeline_name=pipeline_name, destination="postgres", dataset_name=target_schema_name, - full_refresh=False, - ) # full_refresh=False + dev_mode=False, + ) # dev_mode=False # start timer startTime = pendulum.now() diff --git a/docs/technical/README.md b/docs/technical/README.md deleted file mode 100644 index 6e2b5048a8..0000000000 --- a/docs/technical/README.md +++ /dev/null @@ -1,10 +0,0 @@ -## Finished documents - -1. [general_usage.md](general_usage.md) -2. [create_pipeline.md](create_pipeline.md) -3. [secrets_and_config.md](secrets_and_config.md) -4. [working_with_schemas.md](working_with_schemas.md) - -## In progress - -5. [customization_and_hacking.md](customization_and_hacking.md) diff --git a/docs/technical/create_pipeline.md b/docs/technical/create_pipeline.md deleted file mode 100644 index f6603d08b8..0000000000 --- a/docs/technical/create_pipeline.md +++ /dev/null @@ -1,441 +0,0 @@ -# Create Pipeline -marks features that are: - -⛔ not implemented, hard to add - -☮️ not implemented, easy to add - - -## Example from `dlt` module docstring -It is possible to create "intuitive" pipeline just by providing a list of objects to `dlt.run` methods No decorators and secret files, configurations are necessary. - -```python -import dlt -from dlt.sources.helpers import requests - -dlt.run( - requests.get("https://api.chess.com/pub/player/magnuscarlsen/games/2022/11").json()["games"], - destination="duckdb", - table_name="magnus_games" -) -``` - -Run your pipeline script -`$ python magnus_games.py` - -See and query your data with autogenerated Streamlit app -`$ dlt pipeline dlt_magnus_games show` - -## Source extractor function the preferred way -General guidelines: -1. the source extractor is a function decorated with `@dlt.source`. that function **yields** or **returns** a list of resources. -2. resources are generator functions that always **yield** data (enforced by exception which I hope is user friendly). Access to external endpoints, databases etc. should happen from that generator function. Generator functions may be decorated with `@dlt.resource` to provide alternative names, write disposition etc. -3. resource generator functions can be OFC parametrized and resources may be created dynamically -4. the resource generator function may yield **anything that is json serializable**. we prefer to yield _dict_ or list of dicts. -> yielding lists is much more efficient in terms of processing! -5. like any other iterator, the @dlt.source and @dlt.resource **can be iterated and thus extracted and loaded only once**, see example below. - -**Remarks:** - -1. the **@dlt.resource** let's you define the table schema hints: `name`, `write_disposition`, `columns` -2. the **@dlt.source** let's you define global schema props: `name` (which is also source name), `schema` which is Schema object if explicit schema is provided `nesting` to set nesting level etc. -3. 
decorators can also be used as functions ie in case of dlt.resource and `lazy_function` (see examples) - -```python -endpoints = ["songs", "playlist", "albums"] -# return list of resourced -return [dlt.resource(lazy_function(endpoint, name=endpoint) for endpoint in endpoints)] - -``` - -### Extracting data -Source function is not meant to extract the data, but in many cases getting some metadata ie. to generate dynamic resources (like in case of google sheets example) is unavoidable. The source function's body is evaluated **outside** the pipeline `run` (if `dlt.source` is a generator, it is immediately consumed). - -Actual extraction of the data should happen inside the `dlt.resource` which is lazily executed inside the `dlt` pipeline. - -> both a `dlt` source and resource are regular Python iterators and can be passed to any python function that accepts them ie to `list`. `dlt` will evaluate such iterators, also parallel and async ones and provide mock state to it. - -## Multiple resources and resource selection when loading -The source extraction function may contain multiple resources. The resources can be defined as multiple resource functions or created dynamically ie. with parametrized generators. -The user of the pipeline can check what resources are available and select the resources to load. - - -**each resource has a a separate resource function** -```python -from dlt.sources.helpers import requests -import dlt - -@dlt.source -def hubspot(...): - - @dlt.resource(write_disposition="replace") - def users(): - # calls to API happens here - ... - yield users - - @dlt.resource(write_disposition="append") - def transactions(): - ... - yield transactions - - # return a list of resources - return users, transactions - -# load all resources -taktile_data(1).run(destination=bigquery) -# load only decisions -taktile_data(1).with_resources("decisions").run(....) - -# alternative form: -source = taktile_data(1) -# select only decisions to be loaded -source.resources.select("decisions") -# see what is selected -print(source.selected_resources) -# same as this -print(source.resources.selected) -``` - -Except being accessible via `source.resources` dictionary, **every resource is available as an attribute of the source**. For the example above -```python -print(list(source.decisions)) # will iterate decisions resource -source.logs.selected = False # deselect resource -``` - -## Resources may be created dynamically -Here we implement a single parametrized function that **yields** data and we call it repeatedly. Mind that the function body won't be executed immediately, only later when generator is consumed in extract stage. - -```python - -@dlt.source -def spotify(): - - endpoints = ["songs", "playlists", "albums"] - - def get_resource(endpoint): - # here we yield the whole response - yield requests.get(url + "/" + endpoint).json() - - # here we yield resources because this produces cleaner code - for endpoint in endpoints: - # calling get_resource creates generator, the actual code of the function will be executed in extractor - yield dlt.resource(get_resource(endpoint), name=endpoint) - -``` - -## Unbound (parametrized) resources -Imagine the situation in which you have a resource for which you want (or require) user to pass some options ie. the number of records returned. - -> try it, it is ⚡ powerful - -1. In all examples above you do that via the source and returned resources are not parametrized. -OR -2. You can return a **parametrized (unbound)** resources from the source. 
- -```python - -@dlt.source -def chess(chess_api_url): - - # let people choose player title, the default is grand master - @dlt.resource - def players(title_filter="GM", max_results=10): - yield - - # ❗ return the players without the calling - return players - -s = chess("url") -# let's parametrize the resource to select masters. you simply call `bind` method on the resource to bind it -# if you do not bind it, the default values are used -s.players.bind("M", max_results=1000) -# load the masters -s.run() - -``` - -## A standalone @resource -A general purpose resource (ie. jsonl reader, generic sql query reader etc.) that you want to add to any of your sources or multiple instances of it to your pipelines? -Yeah definitely possible. Just replace `@source` with `@resource` decorator. - -```python -@dlt.resource(name="logs", write_disposition="append") -def taktile_data(initial_log_id, taktile_api_key=dlt.secret.value): - - # yes, this will also work but data will be obtained immediately when taktile_data() is called. - resp = requests.get( - "https://taktile.com/api/v2/logs?from_log_id=%i" % initial_log_id, - headers={"Authorization": taktile_api_key}) - resp.raise_for_status() - for item in resp.json()["result"]: - yield item - -# this will load the resource into default schema. see `general_usage.md) -dlt.run(source=taktile_data(1), destination=bigquery) - -``` -How standalone resource works: -1. It can be used like a source that contains only one resource (ie. single endpoint) -2. The main difference is that when extracted it will join the default schema in the pipeline (or explicitly passed schema) -3. It can be called from a `@source` function and then it becomes a resource of that source and joins the source schema - -## `dlt` state availability - -The state is a python dictionary-like object that is available within the `@dlt.source` and `@dlt.resource` decorated functions and may be read and written to. -The data within the state is loaded into destination together with any other extracted data and made automatically available to the source/resource extractor functions when they are run next time. -When using the state: -* Any JSON-serializable values can be written and the read from the state. -* The state available in the `dlt source` is read only and any changes will be discarded. Still it may be used to initialize the resources. -* The state available in the `dlt resource` is writable and written values will be available only once - -### State sharing and isolation across sources - -1. Each source and resources **in the same Python module** (no matter if they are standalone, inner or created dynamically) share the same state dictionary and is separated from other sources -2. Source accepts `section` argument which creates a separate state for that resource (and separate configuration as well). All sources with the same `section` share the state. -2. All the standalone resources and generators that do not belong to any source share the same state when being extracted (they are extracted withing ad-hoc created source) - -## Stream resources: dispatching data to several tables from single resources -What about resource like rasa tracker or singer tap that send a stream of events that should be routed to different tables? we have an answer (actually two): -1. in many cases the table name is based on the data item content (ie. you dispatch events of given type to different tables by event type). We can pass a function that takes the data item as input and returns table name. 
-```python -# send item to a table with name item["type"] -@dlt.resource(table_name=lambda i: i['type']) -def repo_events() -> Iterator[TDataItems]: - yield item -``` - -2. You can mark the yielded data with a table name (`dlt.mark.with_table_name`). This gives you full control on the name of the table - -see [here](docs/examples/sources/rasa/rasa.py) and [here](docs/examples/sources/singer_tap.py). - -## Source / resource config sections and arguments injection -You should read [secrets_and_config](secrets_and_config.md) now to understand how configs and credentials are passed to the decorated functions and how the users of them can configure their projects. - -Also look at the following [test](/tests/extract/test_decorators.py) : `test_source_sections` - -## Example sources and resources - -### With inner resource function -Resource functions can be placed inside the source extractor function. That lets them get access to source function input arguments and all the computations within the source function via so called closure. - -```python -from dlt.sources.helpers import requests -import dlt - -# the `dlt.source` tell the library that the decorated function is a source -# it will use function name `taktile_data` to name the source and the generated schema by default -# in general `@source` should **return** a list of resources or list of generators (function that yield data) -# @source may also **yield** resources or generators - if yielding is more convenient -# if @source returns or yields data - this will generate exception with a proper explanation. dlt user can always load the data directly without any decorators like in the previous example! -@dlt.source -def taktile_data(initial_log_id, taktile_api_key=dlt.secret.value): - - # the `dlt.resource` tells the `dlt.source` that the function defines a resource - # will use function name `logs` as resource/table name by default - # the function should **yield** the data items one by one or **yield** a list. - # here the decorator is optional: there are no parameters to `dlt.resource` - @dlt.resource - def logs(): - resp = requests.get( - "https://taktile.com/api/v2/logs?from_log_id=%i" % initial_log_id, - headers={"Authorization": taktile_api_key}) - resp.raise_for_status() - # option 1: yield the whole list - yield resp.json()["result"] - # or -> this is useful if you deal with a stream of data and for that you need an API that supports that, for example you could yield lists containing paginated results - for item in resp.json()["result"]: - yield item - - # as mentioned we return a resource or a list of resources - return logs - # this will also work - # return logs() -``` - -### With outer generator yielding data, and @resource created dynamically -```python - -def taktile_logs_data(initial_log_id, taktile_api_key=dlt.secret.value) - yield data - - -@dlt.source -def taktile_data(initial_log_id, taktile_api_key): - # pass the arguments and convert to resource - return dlt.resource(taktile_logs_data(initial_log_id, taktile_api_key), name="logs", write_disposition="append") -``` - -### A source with resources defined elsewhere -Example of the above -```python -from taktile.resources import logs - -@dlt.source -def taktile_data(initial_log_id, taktile_api_key=dlt.secret.value): - return logs(initial_log_id, taktile_api_key) -``` - -## Advanced Topics - -### Transformers ⚡ -This happens all the time: -1. We have an endpoint that returns a list of users and then we must get each profile with a separate call. -2. 
The situation above is getting even more complicated when we need that list in two places in our source ie. we want to get the profiles but also a list of transactions per user. - -Ideally we would obtain the list only once and then call and yield from the profiles and transactions endpoint in parallel so the extraction time is minimized. - -Here's example how to do that: [run resources and transformers in parallel threads](/docs/examples/chess/chess.py) and test named `test_evolve_schema` - -More on transformers: -1. you can have unbound (parametrized) transformers as well -2. you can use pipe '|' operator to pipe data from resources to transformers instead of binding them statically with `data_from`. -> see our [singer tap](/docs/examples/singer_tap_jsonl_example.py) example where we pipe a stream of document from `jsonl` into `raw_singer_tap` which is a standalone, unbound ⚡ transformer. -3. If transformer yields just one element you can `return` it instead. This allows you to apply the `retry` and `defer` (parallel execution) decorators directly to it. - -#### Transformer example - -Here we have a list of huge documents and we want to load into several tables. - -```python -@dlt.source -def spotify(): - - # deselect by default, we do not want to load the huge doc - @dlt.resource(selected=False) - def get_huge_doc(): - return requests.get(...) - - # make songs and playlists to be dependent on get_huge_doc - @dlt.transformer(data_from=get_huge_doc) - def songs(huge_doc): - yield huge_doc["songs"] - - @dlt.transformer(data_from=get_huge_doc) - def playlists(huge_doc): - yield huge_doc["playlists"] - - # as you can see the get_huge_doc is not even returned, nevertheless it will be evaluated (only once) - # the huge doc will not be extracted and loaded - return songs, playlists - # we could also use the pipe operator, intead of providing_data from - # return get_huge_doc | songs, get_huge_doc | playlists -``` - -## Data item transformations - -You can attach any number of transformations to your resource that are evaluated on item per item basis. The available transformation types: -* map - transform the data item -* filter - filter the data item -* yield map - a map that returns iterator (so single row may generate many rows) - -You can add and insert transformations on the `DltResource` object (ie. decorated function) -* resource.add_map -* resource.add_filter -* resource.add_yield_map - -> Transformations always deal with single items even if you return lists. - -You can add transformations to a resource (also within a source) **after it is created**. This allows to customize existing pipelines. The transformations may -be distributed with the pipeline or written ad hoc in pipeline script. -```python -# anonymize creates nice deterministic hash for any hashable data type (not implemented yet:) -from dlt.helpers import anonymize - -# example transformation provided by the user -def anonymize_user(user_data): - user_data["user_id"] = anonymize(user_data["user_id"]) - user_data["user_email"] = anonymize(user_data["user_email"]) - return user_data - -@dlt.source -def pipedrive(...): - ... - - @dlt.resource(write_disposition="replace") - def users(): - ... - users = requests.get(...) - ... - yield users - - return users, deals, customers -``` - -in pipeline script: -1. we want to remove user with id == "me" -2. we want to anonymize user data -3. 
we want to pivot `user_props` into KV table - -```python -from pipedrive import pipedrive, anonymize_user - -source = pipedrive() -# access resource in the source by name and add filter and map transformation -source.users.add_filter(lambda user: user["user_id"] != "me").add_map(anonymize_user) -# now we want to yield user props to separate table. we define our own generator function -def pivot_props(user): - # keep user - yield user - # yield user props to user_props table - yield from [ - dlt.mark.with_table_name({"user_id": user["user_id"], "name": k, "value": v}, "user_props") for k, v in user["props"] - ] - -source.user.add_yield_map(pivot_props) -pipeline.run(source) -``` - -We provide a library of various concrete transformations: - -* ☮️ a recursive versions of the map, filter and flat map which can be applied to any nesting level of the data item (the standard transformations work on recursion level 0). Possible applications - - ☮️ recursive rename of dict keys - - ☮️ converting all values to strings - - etc. - -## Some CS Theory - -### The power of decorators - -With decorators dlt can inspect and modify the code being decorated. -1. it knows what are the sources and resources without running them -2. it knows input arguments so it knows the config values and secret values (see `secrets_and_config`). with those we can generate deployments automatically -3. it can inject config and secret values automatically -4. it wraps the functions into objects that provide additional functionalities -- sources and resources are iterators so you can write -```python -items = list(source()) - -for item in source()["logs"]: - ... -``` -- you can select which resources to load with `source().select(*names)` -- you can add mappings and filters to resources - -### The power of yielding: The preferred way to write resources - -The Python function that yields is not a function but magical object that `dlt` can control: -1. it is not executed when you call it! the call just creates a generator (see below). in the example above `taktile_data(1)` will not execute the code inside, it will just return an object composed of function code and input parameters. dlt has control over the object and can execute the code later. this is called `lazy execution` -2. i can control when and how much of the code is executed. the function that yields typically looks like that - -```python -def lazy_function(endpoint_name): - # INIT - this will be executed only once when dlt wants! - get_configuration() - from_item = dlt.current.state.get("last_item", 0) - l = get_item_list_from_api(api_key, endpoint_name) - - # ITERATOR - this will be executed many times also when dlt wants more data! - for item in l: - yield requests.get(url, api_key, "%s?id=%s" % (endpoint_name, item["id"])).json() - # CLEANUP - # this will be executed only once after the last item was yielded! - dlt.current.state["last_item"] = item["id"] -``` - -3. dlt will execute this generator in extractor. the whole execution is atomic (including writing to state). if anything fails with exception the whole extract function fails. -4. 
the execution can be parallelized by using a decorator or a simple modifier function ie: -```python -for item in l: - yield deferred(requests.get(url, api_key, "%s?id=%s" % (endpoint_name, item["id"])).json()) -``` \ No newline at end of file diff --git a/docs/technical/general_usage.md b/docs/technical/general_usage.md index 19c93bcf38..336c892c66 100644 --- a/docs/technical/general_usage.md +++ b/docs/technical/general_usage.md @@ -90,7 +90,7 @@ p.extract([label1, label2, label3], name="labels") # will use default schema "s **By default, one dataset can handle multiple schemas**. The pipeline configuration option `use_single_dataset` controls the dataset layout in the destination. By default it is set to True. In that case only one dataset is created at the destination - by default dataset name which is the same as pipeline name. The dataset name can also be explicitly provided into `dlt.pipeline` `dlt.run` and `Pipeline::load` methods. -All the tables from all the schemas are stored in that dataset. The table names are **not prefixed** with schema names!. If there are any name clashes, tables in the destination will be unions of the fields of all the tables with same name in the schemas. +All the tables from all the schemas are stored in that dataset. The table names are **not prefixed** with schema names!. If there are any name collisions, tables in the destination will be unions of the fields of all the tables with same name in the schemas. **Enabling one dataset per schema layout** If you set `use_single_dataset` to False: @@ -181,44 +181,6 @@ The `run`, `extract`, `normalize` and `load` method raise `PipelineStepFailed` w > should we add it? I have a runner in `dlt` that would be easy to modify -## the `Pipeline` object -There are many ways to create or get current pipeline object. -```python - -# create and get default pipeline -p1 = dlt.pipeline() -# create explicitly configured pipeline -p2 = dlt.pipeline(pipeline_name="pipe", destination=bigquery) -# get recently created pipeline -assert dlt.pipeline() is p2 -# load data with recently created pipeline -assert dlt.run(taktile_data()) is p2 -assert taktile_data().run() is p2 - -``` - -The `Pipeline` object provides following functionalities: -1. `run`, `extract`, `normalize` and `load` methods -2. a `pipeline.schema` dictionary-like object to enumerate and get the schemas in pipeline -3. schema get with `pipeline.schemas[name]` is a live object: any modification to it is automatically applied to the pipeline with the next `run`, `load` etc. see [working_with_schemas.md](working_with_schemas.md) -4. it returns `sql_client` and `native_client` to get direct access to the destination (if destination supports SQL - currently all of them do) -5. it has several methods to inspect the pipeline state and I think those should be exposed via `dlt pipeline` CLI - -for example: -- list the extracted files if any -- list the load packages ready to load -- list the failed jobs in package -- show info on destination: what are the datasets, the current load_id, the current schema etc. - - -## Examples -[we have some here](/examples/) - -## command line interface - - -## logging -I need your input for user friendly logging. What should we log? What is important to see? 
## pipeline runtime setup diff --git a/docs/technical/working_with_schemas.md b/docs/technical/working_with_schemas.md index d94edb8727..532f0e5a1d 100644 --- a/docs/technical/working_with_schemas.md +++ b/docs/technical/working_with_schemas.md @@ -1,134 +1,7 @@ -## General approach to define schemas -marks features that are: - -⛔ not implemented, hard to add - -☮️ not implemented, easy to add - -## Schema components - -### Schema content hash and version -Each schema file contains content based hash `version_hash` that is used to -1. detect manual changes to schema (ie. user edits content) -2. detect if the destination database schema is synchronized with the file schema - -Each time the schema is saved, the version hash is updated. - -Each schema contains also numeric version which increases automatically whenever schema is updated and saved. This version is mostly for informative purposes and there are cases where the increasing order will be lost. - -> Schema in the database is only updated if its hash is not stored in `_dlt_versions` table. In principle many pipelines may send data to a single dataset. If table name clash then a single table with the union of the columns will be created. If columns clash and they have different types etc. then the load will fail. - -### ❗ Normalizer and naming convention - -The parent table is created from all top level fields, if field are dictionaries they will be flattened. **all the key names will be converted with the configured naming convention**. The current naming convention -1. converts to snake_case, small caps. removes all ascii characters except alphanum and underscore -2. add `_` if name starts with number -3. multiples of `_` are converted into single `_` -4. the parent-child relation is expressed as double `_` in names. - -The nested lists will be converted into child tables. - -The data normalizer and the naming convention are part of the schema configuration. In principle the source can set own naming convention or json unpacking mechanism. Or user can overwrite those in `config.toml` - -> The table and column names are mapped automatically. **you cannot rename the columns or tables by changing the `name` property - you must rename your source documents** - -> if you provide any schema elements that contain identifiers via decorators or arguments (ie. `table_name` or `columns`) all the names used will be converted via the naming convention when adding to the schema. For example if you execute `dlt.run(... table_name="CamelCase")` the data will be loaded into `camel_case` - -> 💡 use simple, short small caps identifiers for everything! - -☠️ not implemented! - -⛔ The schema holds lineage information (from json paths to tables/columns) and (1) automatically adapts to destination limits ie. postgres 64 chars by recomputing all names (2) let's user to change the naming convention ie. to verbatim naming convention of `duckdb` where everything is allowed as identifier. - -⛔ Any naming convention generates name clashes. `dlt` detects and fixes name clashes using lineage information - - -#### JSON normalizer settings -Yes those are part of the normalizer module and can be plugged in. -1. column propagation from parent to child tables -2. 
nesting level - -```yaml -normalizers: - names: dlt.common.normalizers.names.snake_case - json: - module: dlt.common.normalizers.json.relational - config: - max_nesting: 5 - propagation: - # for all root tables - root: - # propagate root dlt id - _dlt_id: _dlt_root_id - tables: - # for particular tables - blocks: - # propagate timestamp as block_timestamp to child tables - timestamp: block_timestamp - hash: block_hash -``` - -## Data types -"text", "double", "bool", "timestamp", "bigint", "binary", "complex", "decimal", "wei" -⛔ you cannot specify scale and precision for bigint, binary, text and decimal - -☮️ there's no time and date type - -wei is a datatype that tries to best represent native Ethereum 256bit integers and fixed point decimals. it works correcly on postgres and bigquery ## Schema settings The `settings` section of schema let's you define various global rules that impact how tables and columns are inferred from data. -> 💡 it is the best practice to use those instead of providing the exact column schemas via `columns` argument or by pasting them in `yaml`. Any ideas for improvements? tell me. - -### Column hint rules -You can define a global rules that will apply hints to a newly inferred columns. Those rules apply to normalized column names. You can use column names directly or with regular expressions. ❗ when lineages are implemented the regular expressions will apply to lineages not to column names. - -Example from ethereum schema -```yaml -settings: - default_hints: - foreign_key: - - _dlt_parent_id - not_null: - - re:^_dlt_id$ - - _dlt_root_id - - _dlt_parent_id - - _dlt_list_idx - unique: - - _dlt_id - cluster: - - block_hash - partition: - - block_timestamp -``` - -### Preferred data types -You can define rules that will set the data type for newly created columns. Put the rules under `preferred_types` key of `settings`. On the left side there's a rule on a column name, on the right side is the data type. ❗See the column hint rules for naming convention! - -Example: -```yaml -settings: - preferred_types: - timestamp: timestamp - re:^inserted_at$: timestamp - re:^created_at$: timestamp - re:^updated_at$: timestamp - re:^_dlt_list_idx$: bigint -``` - -### data type autodetectors -You can define a set of functions that will be used to infer the data type of the column from a value. The functions are run from top to bottom on the lists. Look in `detections.py` to see what is available. -```yaml -settings: - detections: - - timestamp - - iso_timestamp - - iso_date -``` - -⛔ we may define `all_text` function that will generate string only schemas by telling `dlt` that all types should be coerced to strings. - ### Table exclude and include filters You can define the include and exclude filters on tables but you are much better off transforming and filtering your source data in python. The current implementation is both weird and quite powerful. In essence you can exclude columns and whole tables with regular expressions to which the inputs are normalized lineages of the values. Example @@ -191,54 +64,3 @@ p.run() ``` > The `normalize` stage creates standalone load packages each containing data and schema with particular version. Those packages are of course not impacted by the "live" schema changes. - -## Attaching schemas to sources -The general approach when creating a new pipeline is to setup a few global schema settings and then let the table and column schemas to be generated from the resource hints and data itself. 
- -> ⛔ I do not have any cool "schema builder" api yet to see the global settings. - -The `dlt.source` decorator accepts a schema instance that you can create yourself and whatever you want. It also support a few typical use cases: - -### Schema created implicitly by decorator -If no schema instance is passed, the decorator creates a schema with the name set to source name and all the settings to default. - -### Automatically load schema file stored with source python module -If no schema instance is passed, and a file with a name `{source name}_schema.yml` exists in the same folder as the module with the decorated function, it will be automatically loaded and used as the schema. - -This should make easier to bundle a fully specified (or non trivially configured) schema with a source. - -### Schema is modified in the source function body -What if you can configure your schema or add some tables only inside your schema function, when ie. you have the source credentials and user settings? You could for example add detailed schemas of all the database tables when someone requests a table data to be loaded. This information is available only at the moment source function is called. - -Similarly to the `state`, source and resource function has current schema available via `dlt.current.source_schema` - -Example: - -```python - -# apply schema to the source -@dlt.source -def createx(nesting_level: int): - - schema = dlt.current.source_schema() - - # get default normalizer config - normalizer_conf = dlt.schema.normalizer_config() - # set hash names convention which produces short names without clashes but very ugly - if short_names_convention: - normalizer_conf["names"] = dlt.common.normalizers.names.hash_names - - # apply normalizer conf - schema = Schema("createx", normalizer_conf) - # set nesting level, yeah it's ugly - schema._normalizers_config["json"].setdefault("config", {})["max_nesting"] = nesting_level - # remove date detector and add type detector that forces all fields to strings - schema._settings["detections"].remove("iso_timestamp") - schema._settings["detections"].insert(0, "all_text") - schema.compile_settings() - - return dlt.resource(...) - -``` - -Also look at the following [test](/tests/extract/test_decorators.py) : `test_source_schema_context` diff --git a/docs/website/blog/2023-09-05-mongo-etl.md b/docs/website/blog/2023-09-05-mongo-etl.md index cd102c8895..8dfd953be4 100644 --- a/docs/website/blog/2023-09-05-mongo-etl.md +++ b/docs/website/blog/2023-09-05-mongo-etl.md @@ -168,7 +168,7 @@ Here's a code explanation of how it works under the hood: pipeline_name='from_json', destination='duckdb', dataset_name='mydata', - full_refresh=True, + dev_mode=True, ) # dlt works with lists of dicts, so wrap data to the list load_info = pipeline.run([data], table_name="json_data") diff --git a/docs/website/blog/2023-10-23-arrow-loading.md b/docs/website/blog/2023-10-23-arrow-loading.md index 2cdf4d90e7..25962c932e 100644 --- a/docs/website/blog/2023-10-23-arrow-loading.md +++ b/docs/website/blog/2023-10-23-arrow-loading.md @@ -50,7 +50,7 @@ chat_messages = dlt.resource( In this demo I just extract and normalize data and skip the loading step. 
```py -pipeline = dlt.pipeline(destination="duckdb", full_refresh=True) +pipeline = dlt.pipeline(destination="duckdb", dev_mode=True) # extract first pipeline.extract(chat_messages) info = pipeline.normalize() @@ -98,7 +98,7 @@ chat_messages = dlt.resource( write_disposition="append", )("postgresql://loader:loader@localhost:5432/dlt_data") -pipeline = dlt.pipeline(destination="duckdb", full_refresh=True) +pipeline = dlt.pipeline(destination="duckdb", dev_mode=True) # extract first pipeline.extract(chat_messages) info = pipeline.normalize(workers=3, loader_file_format="parquet") diff --git a/docs/website/blog/2023-12-01-dlt-kestra-demo.md b/docs/website/blog/2023-12-01-dlt-kestra-demo.md index 9f1d7acba2..1b1c79562d 100644 --- a/docs/website/blog/2023-12-01-dlt-kestra-demo.md +++ b/docs/website/blog/2023-12-01-dlt-kestra-demo.md @@ -45,7 +45,7 @@ Wanna jump to the [GitHub repo](https://github.com/dlt-hub/dlt-kestra-demo)? ## HOW IT WORKS -To lay it all out clearly: Everything's automated in **`Kestra`**, with hassle-free data loading thanks to **`dlt`**, and the analytical thinking handled by OpenAI. Here's a diagram to help you understand the general outline of the entire process. +To lay it all out clearly: Everything's automated in **`Kestra`**, with hassle-free data loading thanks to **`dlt`**, and the analytical thinking handled by OpenAI. Here's a diagram to help you understand the general outline of the entire process. ![overview](https://storage.googleapis.com/dlt-blog-images/dlt_kestra_workflow_overview.png) @@ -59,12 +59,12 @@ Once you’ve opened [http://localhost:8080/](http://localhost:8080/) in your br ![Kestra](https://storage.googleapis.com/dlt-blog-images/dlt_kestra_kestra_ui.png) -Now, all you need to do is [create your flows](https://github.com/dlt-hub/dlt-kestra-demo/blob/main/README.md) and execute them. +Now, all you need to do is [create your flows](https://github.com/dlt-hub/dlt-kestra-demo/blob/main/README.md) and execute them. The great thing about **`Kestra`** is its ease of use - it's UI-based, declarative, and language-agnostic. Unless you're using a task like a [Python script](https://kestra.io/plugins/plugin-script-python/tasks/io.kestra.plugin.scripts.python.script), you don't even need to know how to code. -:::tip +:::tip If you're already considering ways to use **`Kestra`** for your projects, consult their [documentation](https://kestra.io/docs) and the [plugin](https://kestra.io/plugins) pages for further insights. ::: @@ -84,7 +84,7 @@ pipeline = dlt.pipeline( pipeline_name="standard_inbox", destination='bigquery', dataset_name="messages_data", - full_refresh=False, + dev_mode=False, ) # Set table name diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md index 93291bfe9a..a723e3554c 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/athena.md +++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md @@ -141,7 +141,7 @@ For every table created as an iceberg table, the Athena destination will create The `merge` write disposition is supported for Athena when using iceberg tables. > Note that: -> 1. there is a risk of tables ending up in inconsistent state in case a pipeline run fails mid flight, because Athena doesn't support transactions, and `dlt` uses multiple DELETE/UPDATE/INSERT statements to implement `merge`, +> 1. 
there is a risk of tables ending up in inconsistent state in case a pipeline run fails mid flight, because Athena doesn't support transactions, and `dlt` uses multiple DELETE/UPDATE/INSERT statements to implement `merge`, > 2. `dlt` creates additional helper tables called `insert_` and `delete_
` in the staging schema to work around Athena's lack of temporary tables. ### dbt support @@ -183,7 +183,7 @@ Here is an example of how to use the adapter to partition a table: from datetime import date import dlt -from dlt.destinations.impl.athena.athena_adapter import athena_partition, athena_adapter +from dlt.destinations.adapters import athena_partition, athena_adapter data_items = [ (1, "A", date(2021, 1, 1)), diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md index 4f99901e37..f97a4a96bb 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md +++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md @@ -232,7 +232,7 @@ Here is an example of how to use the `bigquery_adapter` method to apply hints to from datetime import date, timedelta import dlt -from dlt.destinations.impl.bigquery.bigquery_adapter import bigquery_adapter +from dlt.destinations.adapters import bigquery_adapter @dlt.resource( diff --git a/docs/website/docs/dlt-ecosystem/destinations/dremio.md b/docs/website/docs/dlt-ecosystem/destinations/dremio.md index 546f470938..c087d5dc0a 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/dremio.md +++ b/docs/website/docs/dlt-ecosystem/destinations/dremio.md @@ -86,7 +86,7 @@ Data loading happens by copying a staged parquet files from an object storage bu Dremio does not support `CREATE SCHEMA` DDL statements. -Therefore, "Metastore" data sources, such as Hive or Glue, require that the dataset schema exists prior to running the dlt pipeline. `full_refresh=True` is unsupported for these data sources. +Therefore, "Metastore" data sources, such as Hive or Glue, require that the dataset schema exists prior to running the dlt pipeline. `dev_mode=True` is unsupported for these data sources. "Object Storage" data sources do not have this limitation. diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index d6ec36ae49..1e3d6b8403 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -51,7 +51,7 @@ or via the env variable `SCHEMA__NAMING` or directly in the code: dlt.config["schema.naming"] = "duck_case" ``` :::caution -**duckdb** identifiers are **case insensitive** but display names preserve case. This may create name clashes if, for example, you load JSON with +**duckdb** identifiers are **case insensitive** but display names preserve case. This may create name collisions if, for example, you load JSON with `{"Column": 1, "column": 2}` as it will map data to a single column. ::: diff --git a/docs/website/docs/dlt-ecosystem/destinations/postgres.md b/docs/website/docs/dlt-ecosystem/destinations/postgres.md index ae504728c3..49b3c06208 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/postgres.md +++ b/docs/website/docs/dlt-ecosystem/destinations/postgres.md @@ -105,6 +105,28 @@ The Postgres destination creates UNIQUE indexes by default on columns with the ` create_indexes=false ``` +### Setting up `csv` format +You can provide [non-default](../file-formats/csv.md#default-settings) csv settings via configuration file or explicitly. 
+```toml
+[destination.postgres.csv_format]
+delimiter="|"
+include_header=false
+```
+or
+```py
+from dlt.destinations import postgres
+from dlt.common.data_writers.configuration import CsvFormatConfiguration
+
+csv_format = CsvFormatConfiguration(delimiter="|", include_header=False)
+
+dest_ = postgres(csv_format=csv_format)
+```
+Above, we configure the `csv` format without a header and with **|** as the separator.
+
+:::tip
+You'll need these settings when [importing external files](../../general-usage/resource.md#import-external-files).
+:::
+
 ### dbt support
 This destination [integrates with dbt](../transformations/dbt/dbt.md) via dbt-postgres.
diff --git a/docs/website/docs/dlt-ecosystem/destinations/redshift.md b/docs/website/docs/dlt-ecosystem/destinations/redshift.md
index 7e0679ec6b..ab193c755d 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/redshift.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/redshift.md
@@ -97,6 +97,12 @@ Amazon Redshift supports the following column hints:
 Redshift supports s3 as a file staging destination. dlt will upload files in the parquet format to s3 and ask Redshift to copy their data directly into the db. Please refer to the [S3 documentation](./filesystem.md#aws-s3) to learn how to set up your s3 bucket with the bucket_url and credentials. The `dlt` Redshift loader will use the AWS credentials provided for s3 to access the s3 bucket if not specified otherwise (see config options below). Alternatively to parquet files, you can also specify jsonl as the staging file format. For this, set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`.
+## Identifier names and case sensitivity
+* Up to 127 characters
+* Case insensitive
+* Stores identifiers in lower case
+* Has a case sensitive mode; if enabled, you must [enable case sensitivity in the destination factory](../../general-usage/destination.md#control-how-dlt-creates-table-column-and-other-identifiers)
+
 ### Authentication IAM Role
 If you would like to load from s3 without forwarding the AWS staging credentials but authorize with an IAM role connected to Redshift, follow the [Redshift documentation](https://docs.aws.amazon.com/redshift/latest/mgmt/authorizing-redshift-service.html) to create a role with access to s3 linked to your Redshift cluster and change your destination settings to use the IAM role:
diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md
index 513c951f78..b92d242c8a 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md
@@ -141,12 +141,30 @@ The data is loaded using an internal Snowflake stage. We use the `PUT` command a
 * [insert-values](../file-formats/insert-format.md) is used by default
 * [parquet](../file-formats/parquet.md) is supported
 * [jsonl](../file-formats/jsonl.md) is supported
+* [csv](../file-formats/csv.md) is supported
 
 When staging is enabled:
 * [jsonl](../file-formats/jsonl.md) is used by default
 * [parquet](../file-formats/parquet.md) is supported
+* [csv](../file-formats/csv.md) is supported
 
-> ❗ When loading from `parquet`, Snowflake will store `complex` types (JSON) in `VARIANT` as a string. Use the `jsonl` format instead or use `PARSE_JSON` to update the `VARIANT` field after loading.
+:::caution
Use the `jsonl` format instead or use `PARSE_JSON` to update the `VARIANT` field after loading. +::: + +### Custom csv formats +By default we support csv format [produced by our writers](../file-formats/csv.md#default-settings) which is comma delimited, with header and optionally quoted. + +You can configure your own formatting ie. when [importing](../../general-usage/resource.md#import-external-files) external `csv` files. +```toml +[destination.snowflake.csv_format] +delimiter="|" +include_header=false +on_error_continue=true +``` +Which will read, `|` delimited file, without header and will continue on errors. + +Note that we ignore missing columns `ERROR_ON_COLUMN_COUNT_MISMATCH = FALSE` and we will insert NULL into them. ## Supported column hints Snowflake supports the following [column hints](https://dlthub.com/docs/general-usage/schema#tables-and-columns): @@ -265,6 +283,29 @@ stage_name="DLT_STAGE" keep_staged_files=true ``` +### Setting up `csv` format +You can provide [non-default](../file-formats/csv.md#default-settings) csv settings via configuration file or explicitly. +```toml +[destination.snowflake.csv_format] +delimiter="|" +include_header=false +on_error_continue=true +``` +or +```py +from dlt.destinations import snowflake +from dlt.common.data_writers.configuration import CsvFormatConfiguration + +csv_format = CsvFormatConfiguration(delimiter="|", include_header=False, on_error_continue=True) + +dest_ = snowflake(csv_format=csv_format) +``` +Above we set `csv` file without header, with **|** as a separator and we request to ignore lines with errors. + +:::tip +You'll need those setting when [importing external files](../../general-usage/resource.md#import-external-files) +::: + ### dbt support This destination [integrates with dbt](../transformations/dbt/dbt.md) via [dbt-snowflake](https://github.com/dbt-labs/dbt-snowflake). Both password and key pair authentication are supported and shared with dbt runners. diff --git a/docs/website/docs/dlt-ecosystem/destinations/synapse.md b/docs/website/docs/dlt-ecosystem/destinations/synapse.md index 2e936f193e..6cfcb1ef8f 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/synapse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/synapse.md @@ -148,6 +148,8 @@ Data is loaded via `INSERT` statements by default. The [table index type](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index) of the created tables can be configured at the resource level with the `synapse_adapter`: ```py +from dlt.destinations.adapters import synapse_adapter + info = pipeline.run( synapse_adapter( data=your_resource, diff --git a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md index 11d1276ceb..c6597fadce 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md +++ b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md @@ -252,7 +252,7 @@ it will be normalized to: so your best course of action is to clean up the data yourself before loading and use the default naming convention. 
Nevertheless, you can configure the alternative in `config.toml`: ```toml [schema] -naming="dlt.destinations.weaviate.impl.ci_naming" +naming="dlt.destinations.impl.weaviate.ci_naming" ``` ## Additional destination options diff --git a/docs/website/docs/dlt-ecosystem/file-formats/csv.md b/docs/website/docs/dlt-ecosystem/file-formats/csv.md index 4a57a0e2d6..02a7e81def 100644 --- a/docs/website/docs/dlt-ecosystem/file-formats/csv.md +++ b/docs/website/docs/dlt-ecosystem/file-formats/csv.md @@ -16,7 +16,7 @@ Internally we use two implementations: ## Supported Destinations -Supported by: **Postgres**, **Filesystem** +Supported by: **Postgres**, **Filesystem**, **snowflake** By setting the `loader_file_format` argument to `csv` in the run command, the pipeline will store your data in the csv format at the destination: @@ -28,11 +28,23 @@ info = pipeline.run(some_source(), loader_file_format="csv") `dlt` attempts to make both writers to generate similarly looking files * separators are commas * quotes are **"** and are escaped as **""** -* `NULL` values are empty strings +* `NULL` values both are empty strings and empty tokens as in the example below * UNIX new lines are used * dates are represented as ISO 8601 * quoting style is "when needed" +Example of NULLs: +```sh +text1,text2,text3 +A,B,C +A,,"" +``` + +In the last row both `text2` and `text3` values are NULL. Python `csv` writer +is not able to write unquoted `None` values so we had to settle for `""` + +Note: all destinations capable of writing csvs must support it. + ### Change settings You can change basic **csv** settings, this may be handy when working with **filesystem** destination. Other destinations are tested with standard settings: @@ -59,6 +71,15 @@ NORMALIZE__DATA_WRITER__INCLUDE_HEADER=False NORMALIZE__DATA_WRITER__QUOTING=quote_all ``` +### Destination settings +A few additional settings are available when copying `csv` to destination tables: +* **on_error_continue** - skip lines with errors (only Snowflake) +* **encoding** - encoding of the `csv` file + +:::tip +You'll need those setting when [importing external files](../../general-usage/resource.md#import-external-files) +::: + ## Limitations **arrow writer** diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md index 7b957e98ea..9cd6ad8079 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md @@ -355,7 +355,7 @@ To read more about tables, columns, and datatypes, please refer to [our document `dlt` will **not modify** tables after they are created. So if you changed data types with hints, then you need to **delete the dataset** -or set `full_refresh=True`. +or set `dev_mode=True`. 
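+
+For example, a minimal sketch (assuming a duckdb destination for illustration) of a pipeline that starts from a clean slate on every run:
+```py
+import dlt
+
+# dev_mode=True writes each run into a new dataset (a datetime suffix is added),
+# so changed hints and data types are applied to freshly created tables
+pipeline = dlt.pipeline(
+    pipeline_name="google_sheets_pipeline",
+    destination="duckdb",
+    dev_mode=True,
+)
+```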
::: ## Sources and resources diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md index fde7a64144..36a8569a4a 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database.md @@ -271,7 +271,7 @@ pipeline = dlt.pipeline( pipeline_name="unsw_download", destination=filesystem(os.path.abspath("../_storage/unsw")), progress="log", - full_refresh=True, + dev_mode=True, ) info = pipeline.run( diff --git a/docs/website/docs/general-usage/destination.md b/docs/website/docs/general-usage/destination.md index 760daa2fee..b30403d349 100644 --- a/docs/website/docs/general-usage/destination.md +++ b/docs/website/docs/general-usage/destination.md @@ -18,26 +18,27 @@ We recommend that you declare the destination type when creating a pipeline inst Above we want to use **filesystem** built-in destination. You can use shorthand types only for built-ins. -* Use full **destination class type** +* Use full **destination factory type** -Above we use built in **filesystem** destination by providing a class type `filesystem` from module `dlt.destinations`. You can pass [destinations from external modules](#declare-external-destination) as well. +Above we use built in **filesystem** destination by providing a factory type `filesystem` from module `dlt.destinations`. You can pass [destinations from external modules](#declare-external-destination) as well. -* Import **destination class** +* Import **destination factory** -Above we import destination class for **filesystem** and pass it to the pipeline. +Above we import destination factory for **filesystem** and pass it to the pipeline. -All examples above will create the same destination class with default parameters and pull required config and secret values from [configuration](credentials/configuration.md) - they are equivalent. +All examples above will create the same destination factory with default parameters and pull required config and secret values from [configuration](credentials/configuration.md) - they are equivalent. ### Pass explicit parameters and a name to a destination -You can instantiate **destination class** yourself to configure it explicitly. When doing this you work with destinations the same way you work with [sources](source.md) +You can instantiate **destination factory** yourself to configure it explicitly. When doing this you work with destinations the same way you work with [sources](source.md) -Above we import and instantiate the `filesystem` destination class. We pass explicit url of the bucket and name the destination to `production_az_bucket`. +Above we import and instantiate the `filesystem` destination factory. We pass explicit url of the bucket and name the destination to `production_az_bucket`. + +If destination is not named, its shorthand type (the Python factory name) serves as a destination name. Name your destination explicitly if you need several separate configurations of destinations of the same type (i.e. you wish to maintain credentials for development, staging and production storage buckets in the same config file). Destination name is also stored in the [load info](../running-in-production/running.md#inspect-and-save-the-load-info-and-trace) and pipeline traces so use them also when you need more descriptive names (other than, for example, `filesystem`). 
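+
+A minimal sketch of an explicitly named destination (the bucket url and names below are only illustrative):
+```py
+import dlt
+from dlt.destinations import filesystem
+
+# name this instance "production_az_bucket" so it gets its own config and secrets section
+azure_bucket = filesystem("az://dlt-azure-bucket", destination_name="production_az_bucket")
+pipeline = dlt.pipeline("pipeline_azure", destination=azure_bucket)
+```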
-If destination is not named, its shorthand type (the Python class name) serves as a destination name. Name your destination explicitly if you need several separate configurations of destinations of the same type (i.e. you wish to maintain credentials for development, staging and production storage buckets in the same config file). Destination name is also stored in the [load info](../running-in-production/running.md#inspect-and-save-the-load-info-and-trace) and pipeline traces so use them also when you need more descriptive names (other than, for example, `filesystem`). ## Configure a destination We recommend to pass the credentials and other required parameters to configuration via TOML files, environment variables or other [config providers](credentials/config_providers.md). This allows you, for example, to easily switch to production destinations after deployment. @@ -59,7 +60,7 @@ For named destinations you use their names in the config section Note that when you use [`dlt init` command](../walkthroughs/add-a-verified-source.md) to create or add a data source, `dlt` creates a sample configuration for selected destination. ### Pass explicit credentials -You can pass credentials explicitly when creating destination class instance. This replaces the `credentials` argument in `dlt.pipeline` and `pipeline.load` methods - which is now deprecated. You can pass the required credentials object, its dictionary representation or the supported native form like below: +You can pass credentials explicitly when creating destination factory instance. This replaces the `credentials` argument in `dlt.pipeline` and `pipeline.load` methods - which is now deprecated. You can pass the required credentials object, its dictionary representation or the supported native form like below: @@ -74,6 +75,23 @@ You can create and pass partial credentials and `dlt` will fill the missing data Please read how to use [various built in credentials types](credentials/config_specs.md). ::: +### Inspect destination capabilities +[Destination capabilities](../walkthroughs/create-new-destination.md#3-set-the-destination-capabilities) tell `dlt` what given destination can and cannot do. For example it tells which file formats it can load, what is maximum query or identifier length. Inspect destination capabilities as follows: +```py +import dlt +pipeline = dlt.pipeline("snowflake_test", destination="snowflake") +print(dict(pipeline.destination.capabilities())) +``` + +### Pass additional parameters and change destination capabilities +Destination factory accepts additional parameters that will be used to pre-configure it and change destination capabilities. +```py +import dlt +duck_ = dlt.destinations.duckdb(naming_convention="duck_case", recommended_file_size=120000) +print(dict(duck_.capabilities())) +``` +Example above is overriding `naming_convention` and `recommended_file_size` in the destination capabilities. + ### Configure multiple destinations in a pipeline To configure multiple destinations within a pipeline, you need to provide the credentials for each destination in the "secrets.toml" file. This example demonstrates how to configure a BigQuery destination named `destination_one`: @@ -86,7 +104,7 @@ private_key = "please set me up!" client_email = "please set me up!" 
``` -You can then use this destination in your pipeline as follows: +You can then use this destination in your pipeline as follows: ```py import dlt from dlt.common.destination import Destination @@ -117,6 +135,56 @@ Obviously, dlt will access the destination when you instantiate [sql_client](../ ::: +## Control how `dlt` creates table, column and other identifiers +`dlt` maps identifiers found in the source data into destination identifiers (ie. table and columns names) using [naming conventions](naming-convention.md) which ensure that +character set, identifier length and other properties fit into what given destination can handle. For example our [default naming convention (**snake case**)](naming-convention.md#default-naming-convention-snake_case) converts all names in the source (ie. JSON document fields) into snake case, case insensitive identifiers. + +Each destination declares its preferred naming convention, support for case sensitive identifiers and case folding function that case insensitive identifiers follow. For example: +1. Redshift - by default does not support case sensitive identifiers and converts all of them to lower case. +2. Snowflake - supports case sensitive identifiers and considers upper cased identifiers as case insensitive (which is the default case folding) +3. DuckDb - does not support case sensitive identifiers but does not case fold them so it preserves the original casing in the information schema. +4. Athena - does not support case sensitive identifiers and converts all of them to lower case. +5. BigQuery - all identifiers are case sensitive, there's no case insensitive mode available via case folding (but it can be enabled in dataset level). + +You can change the naming convention used in [many different ways](naming-convention.md#configure-naming-convention), below we set the preferred naming convention on the Snowflake destination to `sql_cs` to switch Snowflake to case sensitive mode: +```py +import dlt +snow_ = dlt.destinations.snowflake(naming_convention="sql_cs_v1") +``` +Setting naming convention will impact all new schemas being created (ie. on first pipeline run) and will re-normalize all existing identifiers. + +:::caution +`dlt` prevents re-normalization of identifiers in tables that were already created at the destination. Use [refresh](pipeline.md#refresh-pipeline-data-and-state) mode to drop the data. You can also disable this behavior via [configuration](naming-convention.md#avoid-identifier-collisions) +::: + +:::note +Destinations that support case sensitive identifiers but use case folding convention to enable case insensitive identifiers are configured in case insensitive mode by default. Examples: Postgres, Snowflake, Oracle. +::: + +:::caution +If you use case sensitive naming convention with case insensitive destination, `dlt` will: +1. Fail the load if it detects identifier collision due to case folding +2. Warn if any case folding is applied by the destination. +::: + +### Enable case sensitive identifiers support +Selected destinations may be configured so they start accepting case sensitive identifiers. For example, it is possible to set case sensitive collation on **mssql** database and then tell `dlt` about it. +```py +from dlt.destinations import mssql +dest_ = mssql(has_case_sensitive_identifiers=True, naming_convention="sql_cs_v1") +``` +Above we can safely use case sensitive naming convention without worrying of name collisions. 
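+
+To verify what the factory ended up with, you can inspect its capabilities as shown earlier on this page; a sketch reusing the `dest_` instance from above:
+```py
+# the factory's capabilities should now report case sensitive identifiers
+print(dest_.capabilities().has_case_sensitive_identifiers)
+```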
+ +You can configure the case sensitivity, **but configuring destination capabilities is not currently supported**. +```toml +[destination.mssql] +has_case_sensitive_identifiers=true +``` + +:::note +In most cases setting the flag above just indicates to `dlt` that you switched the case sensitive option on a destination. `dlt` will not do that for you. Refer to destination documentation for details. +::: + ## Create new destination You have two ways to implement a new destination: 1. You can use `@dlt.destination` decorator and [implement a sink function](../dlt-ecosystem/destinations/destination.md). This is perfect way to implement reverse ETL destinations that push data back to REST APIs. diff --git a/docs/website/docs/general-usage/naming-convention.md b/docs/website/docs/general-usage/naming-convention.md new file mode 100644 index 0000000000..c24b6c4869 --- /dev/null +++ b/docs/website/docs/general-usage/naming-convention.md @@ -0,0 +1,113 @@ +--- +title: Naming Convention +description: Control how dlt creates table, column and other identifiers +keywords: [identifiers, snake case, case sensitive, case insensitive, naming] +--- + +# Naming Convention +`dlt` creates table and column identifiers from the data. The data source that ie. a stream of JSON documents may have identifiers (i.e. key names in a dictionary) with any Unicode characters, of any length and naming style. On the other hand, destinations require that you follow strict rules when you name tables, columns or collections. +A good example is [Redshift](../dlt-ecosystem/destinations/redshift.md#naming-convention) that accepts case-insensitive alphanumeric identifiers with maximum 127 characters. + +`dlt` groups tables from a single [source](source.md) in a [schema](schema.md). + +Each schema defines **naming convention** that tells `dlt` how to translate identifiers to the +namespace that the destination understands. Naming conventions are in essence functions that map strings from the source identifier format into destination identifier format. For example our **snake_case** (default) naming convention will translate `DealFlow` into `deal_flow` identifier. + +You can pick which naming convention to use. `dlt` provides a few to [choose from](#available-naming-conventions) or you can [easily add your own](#write-your-own-naming-convention). + +:::tip +* Standard behavior of `dlt` is to **use the same naming convention for all destinations** so users see always the same tables and column names in their databases. +* Use simple, short small caps identifiers for everything so no normalization is needed +::: + +### Use default naming convention (snake_case) +`dlt` most used and tested with default, case insensitive, lower case naming convention called **snake_case** + +1. Converts identifiers to **snake_case**, small caps. Removes all ascii characters except ascii + alphanumerics and underscores. +1. Adds `_` if name starts with number. +1. Multiples of `_` are converted into single `_`. +1. The parent-child relation is expressed as double `_` in names. +1. It shorts the identifier if it exceed the length at the destination. + +> 💡 Standard behavior of `dlt` is to **use the same naming convention for all destinations** so +> users see always the same tables and columns in their databases. + +> 💡 If you provide any schema elements that contain identifiers via decorators or arguments (i.e. +> `table_name` or `columns`) all the names used will be converted via the naming convention when +> adding to the schema. 
For example if you execute `dlt.run(... table_name="CamelCase")` the data +> will be loaded into `camel_case`. + +> 💡 Use simple, short small caps identifiers for everything! + +:::tip +If you do not like **snake_case** your next safe option is **sql_ci** which generates SQL-safe, lower-case, case-insensitive identifiers without any +other transformations. To permanently change the default naming convention on a given machine: +1. set an environment variable `SCHEMA__NAMING` to `sql_ci_v1` OR +2. add the following line to your global `config.toml` (the one in your home dir ie. `~/.dlt/config.toml`) +```toml +[schema] +naming="sql_ci_v1" +``` +::: + +## Source identifiers vs destination identifiers +### Pick the right identifier form when defining resources +`dlt` keeps source (not normalized) identifiers during data [extraction](../reference/explainers/how-dlt-works.md#extract) and translates them during [normalization](../reference/explainers/how-dlt-works.md#normalize). For you it means: +1. If you write a [transformer](resource.md#process-resources-with-dlttransformer) or a [mapping/filtering function](resource.md#filter-transform-and-pivot-data), you will see the original data, without any normalization. Use the source key names to access the dicts! +2. If you define a `primary_key` or `cursor` that participate in [incremental loading](incremental-loading.md#incremental-loading-with-a-cursor-field) use the source identifiers (as `dlt` will inspect the source data). +3. When defining any other hints ie. `columns` or `merge_key` you can pick source or destination identifiers. `dlt` normalizes all hints together with your data. +4. `Schema` object (ie. obtained from the pipeline or from `dlt` source via `discover_schema`) **always contains destination (normalized) identifiers**. + +In the snippet below, we define a resource with various "illegal" unicode characters in table name and other hint and demonstrate how they get normalized in the schema object. +```py +``` + +### Understand the identifier normalization +Identifiers are translated from source to destination form in **normalize** step. Here's how `dlt` picks the right naming convention: + +* Each destination has a preferred naming convention. +* This naming convention is used when new schemas are created. +* Schemas preserve naming convention when saved +* `dlt` applies final naming convention in `normalize` step. Naming convention comes from (1) explicit configuration (2) from destination capabilities. Naming convention +in schema will be ignored. +* You can change the naming convention in the capabilities: (name, case-folding, case sensitivity) + +### Case sensitive and insensitive destinations +Naming conventions come in two types. +* **case sensitive** naming convention normalize source identifiers into case sensitive identifiers where character +* **case insensitive** + +Case sensitive naming convention will put a destination in [case sensitive mode](destination.md#control-how-dlt-creates-table-column-and-other-identifiers). Identifiers that +differ only in casing will not [collide](#avoid-identifier-collisions). Note that many destinations are exclusively case insensitive, of which some preserve casing of identifiers (ie. **duckdb**) and some will case-fold identifiers when creating tables (ie. **Redshift**, **Athena** do lower case on the names). + +## Identifier shortening +Identifier shortening happens during normalization. 
`dlt` takes the maximum length of the identifier from the destination capabilities and will trim the identifiers that are +too long. The default shortening behavior generates short deterministic hashes of the source identifiers and places them in the middle of the destination identifier. This +(with a high probability) avoids shortened identifier collisions. + + +## Pick your own naming convention + +### Configure naming convention +The naming convention is configurable and users can easily create their own +conventions that i.e. pass all the identifiers unchanged if the destination accepts that (i.e. +DuckDB). + + +### Available naming conventions + +### Set and adjust naming convention explicitly + +## Avoid identifier collisions + + +`dlt` detects various types of collisions and ignores the others. + + +## Write your own naming convention +Naming conventions reside in separate Python modules, are classes with `NamingConvention` name and must derive from `BaseNamingConvention`. We include two examples of +naming conventions that you may find useful + +1. A variant of `sql_ci` that generates identifier collisions with a low (user defined) probability by appending a deterministic tag to each name. +2. A variant of `sql_cs` that allows for LATIN-2 (ie. umlaut) characters diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md index ac7f7e6b38..14f8d73b58 100644 --- a/docs/website/docs/general-usage/resource.md +++ b/docs/website/docs/general-usage/resource.md @@ -488,6 +488,59 @@ be adjusted after the `batch` is processed in the extract pipeline but before an You can emit columns as Pydantic model and use dynamic hints (ie. lambda for table name) as well. You should avoid redefining `Incremental` this way. ::: +### Import external files +You can import external files ie. `csv`, `parquet` and `jsonl` by yielding items marked with `with_file_import`, optionally passing table schema corresponding +the the imported file. `dlt` will not read, parse and normalize any names (ie. `csv` or `arrow` headers) and will attempt to copy the file into the destination as is. +```py +import os +import dlt + +from filesystem import filesystem + +columns: List[TColumnSchema] = [ + {"name": "id", "data_type": "bigint"}, + {"name": "name", "data_type": "text"}, + {"name": "description", "data_type": "text"}, + {"name": "ordered_at", "data_type": "date"}, + {"name": "price", "data_type": "decimal"}, +] + +import_folder = "/tmp/import" + +@dlt.transformer(columns=columns) +def orders(items: Iterator[FileItemDict]): + for item in items: + # copy file locally + dest_file = os.path.join(import_folder, item["file_name"]) + # download file + item.fsspec.download(item["file_url"], dest_file) + # tell dlt to import the dest_file as `csv` + yield dlt.mark.with_file_import(dest_file, "csv") + + +# use filesystem verified source to glob a bucket +downloader = filesystem( + bucket_url="s3://my_bucket/csv", + file_glob="today/*.csv.gz") | orders + +info = pipeline.run(orders, destination="snowflake") +``` +In the example above, we glob all zipped csv files present on **my_bucket/csv/today** (using `filesystem` verified source) and send file descriptors to `orders` transformer. Transformer downloads and imports the files into extract package. At the end, `dlt` sends them to snowflake (the table will be created because we use `column` hints to define the schema). 
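+
+The snippet above assumes a `pipeline` object already exists; a minimal sketch of creating one and running the globbed files through the transformer (names here are illustrative):
+```py
+pipeline = dlt.pipeline("orders_pipeline", destination="snowflake")
+# run the filesystem resource piped into the orders transformer
+info = pipeline.run(downloader)
+```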
+ +If imported `csv` files are not in `dlt` [default format](../dlt-ecosystem/file-formats/csv.md#default-settings), you may need to pass additional configuration. +```toml +[destination.snowflake.csv_format] +delimiter="|" +include_header=false +on_error_continue=true +``` + +You can sniff the schema from the data ie. using `duckdb` to infer the table schema from `csv` file. `dlt.mark.with_file_import` accepts additional arguments that you can use to pass hints at run time. + +:::note +* If you do not define any columns, the table will not be created in the destination. `dlt` will still attempt to load data into it, so you create a fitting table upfront, the load process will succeed. +* Files are imported using hard links if possible to avoid copying and duplicating storage space needed. +::: ### Duplicate and rename resources There are cases when you your resources are generic (ie. bucket filesystem) and you want to load several instances of it (ie. files from different folders) to separate tables. In example below we use `filesystem` source to load csvs from two different folders into separate tables: @@ -538,12 +591,30 @@ pipeline.run(generate_rows(10)) # load a list of resources pipeline.run([generate_rows(10), generate_rows(20)]) ``` + +### Pick loader file format for a particular resource +You can request a particular loader file format to be used for a resource. +```py +@dlt.resource(file_format="parquet") +def generate_rows(nr): + for i in range(nr): + yield {'id':i, 'example_string':'abc'} +``` +Resource above will be saved and loaded from a `parquet` file (if destination supports it). + +:::note +A special `file_format`: **preferred** will load resource using a format that is preferred by a destination. This settings supersedes the `loader_file_format` passed to `run` method. +::: + ### Do a full refresh -To do a full refresh of an `append` or `merge` resources you temporarily change the write -disposition to replace. You can use `apply_hints` method of a resource or just provide alternative -write disposition when loading: +To do a full refresh of an `append` or `merge` resources you set the `refresh` argument on `run` method to `drop_data`. This will truncate the tables without dropping them. + +```py +p.run(merge_source(), refresh="drop_data") +``` +You can also [fully drop the tables](pipeline.md#refresh-pipeline-data-and-state) in the `merge_source`: ```py -p.run(merge_source(), write_disposition="replace") +p.run(merge_source(), refresh="drop_sources") ``` diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md index 989b023b01..0e3e3bba1f 100644 --- a/docs/website/docs/general-usage/schema.md +++ b/docs/website/docs/general-usage/schema.md @@ -42,8 +42,9 @@ characters, any lengths and naming styles. On the other hand the destinations ac namespaces for their identifiers. Like Redshift that accepts case-insensitive alphanumeric identifiers with maximum 127 characters. -Each schema contains `naming convention` that tells `dlt` how to translate identifiers to the -namespace that the destination understands. +Each schema contains [naming convention](naming-convention.md) that tells `dlt` how to translate identifiers to the +namespace that the destination understands. This convention can be configured, changed in code or enforced via +destination. The default naming convention: @@ -214,7 +215,7 @@ The precision for **bigint** is mapped to available integer types ie. 
TINYINT, I ## Schema settings The `settings` section of schema file lets you define various global rules that impact how tables -and columns are inferred from data. +and columns are inferred from data. For example you can assign **primary_key** hint to all columns with name `id` or force **timestamp** data type on all columns containing `timestamp` with an use of regex pattern. > 💡 It is the best practice to use those instead of providing the exact column schemas via `columns` > argument or by pasting them in `yaml`. @@ -222,8 +223,9 @@ and columns are inferred from data. ### Data type autodetectors You can define a set of functions that will be used to infer the data type of the column from a -value. The functions are run from top to bottom on the lists. Look in [`detections.py`](https://github.com/dlt-hub/dlt/blob/devel/dlt/common/schema/detections.py) to see what is -available. +value. The functions are run from top to bottom on the lists. Look in `detections.py` to see what is +available. **iso_timestamp** detector that looks for ISO 8601 strings and converts them to **timestamp** +is enabled by default. ```yaml settings: @@ -236,12 +238,24 @@ settings: - wei_to_double ``` +Alternatively you can add and remove detections from code: +```py + source = data_source() + # remove iso time detector + source.schema.remove_type_detection("iso_timestamp") + # convert UNIX timestamp (float, withing a year from NOW) into timestamp + source.schema.add_type_detection("timestamp") +``` +Above we modify a schema that comes with a source to detect UNIX timestamps with **timestamp** detector. + ### Column hint rules You can define a global rules that will apply hints of a newly inferred columns. Those rules apply -to normalized column names. You can use column names directly or with regular expressions. +to normalized column names. You can use column names directly or with regular expressions. `dlt` is matching +the column names **after they got normalized with naming convention**. -Example from ethereum schema: +By default, schema adopts hints rules from json(relational) normalizer to support correct hinting +of columns added by normalizer: ```yaml settings: @@ -249,36 +263,59 @@ settings: foreign_key: - _dlt_parent_id not_null: - - re:^_dlt_id$ + - _dlt_id - _dlt_root_id - _dlt_parent_id - _dlt_list_idx + - _dlt_load_id unique: - _dlt_id - cluster: - - block_hash + root_key: + - _dlt_root_id +``` +Above we require exact column name match for a hint to apply. You can also use regular expression (which we call `SimpleRegex`) as follows: +```yaml +settings: partition: - - block_timestamp + - re:_timestamp$ +``` +Above we add `partition` hint to all columns ending with `_timestamp`. You can do same thing in the code +```py + source = data_source() + # this will update existing hints with the hints passed + source.schema.merge_hints({"partition": ["re:_timestamp$"]}) ``` ### Preferred data types You can define rules that will set the data type for newly created columns. Put the rules under `preferred_types` key of `settings`. On the left side there's a rule on a column name, on the right -side is the data type. - -> ❗See the column hint rules for naming convention! +side is the data type. You can use column names directly or with regular expressions. +`dlt` is matching the column names **after they got normalized with naming convention**. 
Example: ```yaml settings: preferred_types: - timestamp: timestamp - re:^inserted_at$: timestamp - re:^created_at$: timestamp - re:^updated_at$: timestamp - re:^_dlt_list_idx$: bigint + re:timestamp: timestamp + inserted_at: timestamp + created_at: timestamp + updated_at: timestamp +``` + +Above we prefer `timestamp` data type for all columns containing **timestamp** substring and define a few exact matches ie. **created_at**. +Here's same thing in code +```py + source = data_source() + source.schema.update_preferred_types( + { + "re:timestamp": "timestamp", + "inserted_at": "timestamp", + "created_at": "timestamp", + "updated_at": "timestamp", + } + ) ``` ### Applying data types directly with `@dlt.resource` and `apply_hints` `dlt` offers the flexibility to directly apply data types and hints in your code, bypassing the need for importing and adjusting schemas. This approach is ideal for rapid prototyping and handling data sources with dynamic schema requirements. @@ -364,7 +401,6 @@ def textual(nesting_level: int): schema.remove_type_detection("iso_timestamp") # convert UNIX timestamp (float, withing a year from NOW) into timestamp schema.add_type_detection("timestamp") - schema._compile_settings() return dlt.resource([]) ``` diff --git a/docs/website/docs/reference/performance_snippets/toml-snippets.toml b/docs/website/docs/reference/performance_snippets/toml-snippets.toml index 5e700c4e31..e1a640e7cf 100644 --- a/docs/website/docs/reference/performance_snippets/toml-snippets.toml +++ b/docs/website/docs/reference/performance_snippets/toml-snippets.toml @@ -71,7 +71,7 @@ max_parallel_items=10 # @@@DLT_SNIPPET_START normalize_workers_toml - [extract.data_writer] +[extract.data_writer] # force extract file rotation if size exceeds 1MiB file_max_bytes=1000000 diff --git a/docs/website/docs/walkthroughs/create-new-destination.md b/docs/website/docs/walkthroughs/create-new-destination.md index 1b72b81e3e..69e7b2fcc1 100644 --- a/docs/website/docs/walkthroughs/create-new-destination.md +++ b/docs/website/docs/walkthroughs/create-new-destination.md @@ -88,6 +88,10 @@ The default `escape_identifier` function identifier escapes `"` and '\' and quot You should avoid providing a custom `escape_literal` function by not enabling `insert-values` for your destination. +### Enable / disable case sensitive identifiers +Specify if destination supports case sensitive identifiers by setting `has_case_sensitive_identifiers` to `True` (or `False` if otherwise). Some case sensitive destinations (ie. **Snowflake** or **Postgres**) support case insensitive identifiers via. case folding ie. **Snowflake** considers all upper case identifiers as case insensitive (set `casefold_identifier` to `str.upper`), **Postgres** does the same with lower case identifiers (`str.lower`). +Some case insensitive destinations (ie. **Athena** or **Redshift**) case-fold (ie. lower case) all identifiers and store them as such. In that case set `casefold_identifier` to `str.lower` as well. + ## 4. Adjust the SQL client **sql client** is a wrapper over `dbapi` and its main role is to provide consistent interface for executing SQL statements, managing transactions and (probably the most important) to help handling errors via classifying exceptions. 
Here's a few things you should pay attention to: diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index d3d7def8fc..1ea92f2e91 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -157,6 +157,7 @@ const sidebars = { 'general-usage/incremental-loading', 'general-usage/full-loading', 'general-usage/schema', + 'general-usage/naming-convention', 'general-usage/schema-contracts', 'general-usage/schema-evolution', { diff --git a/poetry.lock b/poetry.lock index 5a94993c80..894f5868bc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "about-time" @@ -2683,21 +2683,27 @@ test = ["pytest (>=6)"] [[package]] name = "fastembed" -version = "0.1.1" +version = "0.2.6" description = "Fast, light, accurate library built for retrieval embedding generation" optional = true -python-versions = ">=3.8.0,<3.12" +python-versions = "<3.13,>=3.8.0" files = [ - {file = "fastembed-0.1.1-py3-none-any.whl", hash = "sha256:131413ae52cd72f4c8cced7a675f8269dbfd1a852abade3c815e265114bcc05a"}, - {file = "fastembed-0.1.1.tar.gz", hash = "sha256:f7e524ee4f74bb8aad16be5b687d1f77f608d40e96e292c87881dc36baf8f4c7"}, + {file = "fastembed-0.2.6-py3-none-any.whl", hash = "sha256:3e18633291722087abebccccd7fcdffafef643cb22d203370d7fad4fa83c10fb"}, + {file = "fastembed-0.2.6.tar.gz", hash = "sha256:adaed5b46e19cc1bbe5f98f2b3ffecfc4d2a48d27512e28ff5bfe92a42649a66"}, ] [package.dependencies] -onnx = ">=1.11,<2.0" -onnxruntime = ">=1.15,<2.0" +huggingface-hub = ">=0.20,<0.21" +loguru = ">=0.7.2,<0.8.0" +numpy = [ + {version = ">=1.21", markers = "python_version < \"3.12\""}, + {version = ">=1.26", markers = "python_version >= \"3.12\""}, +] +onnx = ">=1.15.0,<2.0.0" +onnxruntime = ">=1.17.0,<2.0.0" requests = ">=2.31,<3.0" -tokenizers = ">=0.13,<0.14" -tqdm = ">=4.65,<5.0" +tokenizers = ">=0.15.1,<0.16.0" +tqdm = ">=4.66,<5.0" [[package]] name = "filelock" @@ -3546,6 +3552,164 @@ files = [ {file = "google_re2-1.1-1-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c6c9f64b9724ec38da8e514f404ac64e9a6a5e8b1d7031c2dadd05c1f4c16fd"}, {file = "google_re2-1.1-1-cp39-cp39-win32.whl", hash = "sha256:d1b751b9ab9f8e2ab2a36d72b909281ce65f328c9115a1685acae1a2d1afd7a4"}, {file = "google_re2-1.1-1-cp39-cp39-win_amd64.whl", hash = "sha256:ac775c75cec7069351d201da4e0fb0cae4c1c5ebecd08fa34e1be89740c1d80b"}, + {file = "google_re2-1.1-2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5eaefe4705b75ca5f78178a50104b689e9282f868e12f119b26b4cffc0c7ee6e"}, + {file = "google_re2-1.1-2-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:e35f2c8aabfaaa4ce6420b3cae86c0c29042b1b4f9937254347e9b985694a171"}, + {file = "google_re2-1.1-2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:35fd189cbaaaa39c9a6a8a00164c8d9c709bacd0c231c694936879609beff516"}, + {file = "google_re2-1.1-2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:60475d222cebd066c80414831c8a42aa2449aab252084102ee05440896586e6a"}, + {file = "google_re2-1.1-2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:871cb85b9b0e1784c983b5c148156b3c5314cb29ca70432dff0d163c5c08d7e5"}, + {file = "google_re2-1.1-2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:94f4e66e34bdb8de91ec6cdf20ba4fa9fea1dfdcfb77ff1f59700d01a0243664"}, + {file = "google_re2-1.1-2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:1563577e2b720d267c4cffacc0f6a2b5c8480ea966ebdb1844fbea6602c7496f"}, + {file = "google_re2-1.1-2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:49b7964532a801b96062d78c0222d155873968f823a546a3dbe63d73f25bb56f"}, + {file = "google_re2-1.1-2-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2362fd70eb639a75fd0187d28b4ba7b20b3088833d8ad7ffd8693d0ba159e1c2"}, + {file = "google_re2-1.1-2-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:86b80719636a4e21391e20a9adf18173ee6ae2ec956726fe2ff587417b5e8ba6"}, + {file = "google_re2-1.1-2-cp310-cp310-win32.whl", hash = "sha256:5456fba09df951fe8d1714474ed1ecda102a68ddffab0113e6c117d2e64e6f2b"}, + {file = "google_re2-1.1-2-cp310-cp310-win_amd64.whl", hash = "sha256:2ac6936a3a60d8d9de9563e90227b3aea27068f597274ca192c999a12d8baa8f"}, + {file = "google_re2-1.1-2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d5a87b436028ec9b0f02fe19d4cbc19ef30441085cdfcdf1cce8fbe5c4bd5e9a"}, + {file = "google_re2-1.1-2-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:fc0d4163de9ed2155a77e7a2d59d94c348a6bbab3cff88922fab9e0d3d24faec"}, + {file = "google_re2-1.1-2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:48b12d953bc796736e7831d67b36892fb6419a4cc44cb16521fe291e594bfe23"}, + {file = "google_re2-1.1-2-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:62c780c927cff98c1538439f0ff616f48a9b2e8837c676f53170d8ae5b9e83cb"}, + {file = "google_re2-1.1-2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:04b2aefd768aa4edeef8b273327806c9cb0b82e90ff52eacf5d11003ac7a0db2"}, + {file = "google_re2-1.1-2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:9c90175992346519ee7546d9af9a64541c05b6b70346b0ddc54a48aa0d3b6554"}, + {file = "google_re2-1.1-2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22ad9ad9d125249d6386a2e80efb9de7af8260b703b6be7fa0ab069c1cf56ced"}, + {file = "google_re2-1.1-2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f70971f6ffe5254e476e71d449089917f50ebf9cf60f9cec80975ab1693777e2"}, + {file = "google_re2-1.1-2-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f267499529e64a4abed24c588f355ebe4700189d434d84a7367725f5a186e48d"}, + {file = "google_re2-1.1-2-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b632eff5e4cd44545a9c0e52f2e1becd55831e25f4dd4e0d7ec8ee6ca50858c1"}, + {file = "google_re2-1.1-2-cp311-cp311-win32.whl", hash = "sha256:a42c733036e8f242ee4e5f0e27153ad4ca44ced9e4ce82f3972938ddee528db0"}, + {file = "google_re2-1.1-2-cp311-cp311-win_amd64.whl", hash = "sha256:64f8eed4ca96905d99b5286b3d14b5ca4f6a025ff3c1351626a7df2f93ad1ddd"}, + {file = "google_re2-1.1-2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5541efcca5b5faf7e0d882334a04fa479bad4e7433f94870f46272eec0672c4a"}, + {file = "google_re2-1.1-2-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:92309af35b6eb2d3b3dc57045cdd83a76370958ab3e0edd2cc4638f6d23f5b32"}, + {file = "google_re2-1.1-2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:197cd9bcaba96d18c5bf84d0c32fca7a26c234ea83b1d3083366f4392cb99f78"}, + {file = "google_re2-1.1-2-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:1b896f171d29b541256cf26e10dccc9103ac1894683914ed88828ca6facf8dca"}, + {file = "google_re2-1.1-2-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:e022d3239b945014e916ca7120fee659b246ec26c301f9e0542f1a19b38a8744"}, + {file = "google_re2-1.1-2-cp38-cp38-macosx_13_0_x86_64.whl", hash = 
"sha256:2c73f8a9440873b68bee1198094377501065e85aaf6fcc0d2512c7589ffa06ca"}, + {file = "google_re2-1.1-2-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:901d86555bd7725506d651afaba7d71cd4abd13260aed6cfd7c641a45f76d4f6"}, + {file = "google_re2-1.1-2-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ce4710ff636701cfb56eb91c19b775d53b03749a23b7d2a5071bbbf4342a9067"}, + {file = "google_re2-1.1-2-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76a20e5ebdf5bc5d430530197e42a2eeb562f729d3a3fb51f39168283d676e66"}, + {file = "google_re2-1.1-2-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:77c9f4d4bb1c8de9d2642d3c4b8b615858ba764df025b3b4f1310266f8def269"}, + {file = "google_re2-1.1-2-cp38-cp38-win32.whl", hash = "sha256:94bd60785bf37ef130a1613738e3c39465a67eae3f3be44bb918540d39b68da3"}, + {file = "google_re2-1.1-2-cp38-cp38-win_amd64.whl", hash = "sha256:59efeb77c0dcdbe37794c61f29c5b1f34bc06e8ec309a111ccdd29d380644d70"}, + {file = "google_re2-1.1-2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:221e38c27e1dd9ccb8e911e9c7aed6439f68ce81e7bb74001076830b0d6e931d"}, + {file = "google_re2-1.1-2-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:d9145879e6c2e1b814445300b31f88a675e1f06c57564670d95a1442e8370c27"}, + {file = "google_re2-1.1-2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:c8a12f0740e2a52826bdbf95569a4b0abdf413b4012fa71e94ad25dd4715c6e5"}, + {file = "google_re2-1.1-2-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:9c9998f71466f4db7bda752aa7c348b2881ff688e361108fe500caad1d8b9cb2"}, + {file = "google_re2-1.1-2-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:0c39f69b702005963a3d3bf78743e1733ad73efd7e6e8465d76e3009e4694ceb"}, + {file = "google_re2-1.1-2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:6d0ce762dee8d6617d0b1788a9653e805e83a23046c441d0ea65f1e27bf84114"}, + {file = "google_re2-1.1-2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ecf3619d98c9b4a7844ab52552ad32597cdbc9a5bdbc7e3435391c653600d1e2"}, + {file = "google_re2-1.1-2-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9a1426a8cbd1fa004974574708d496005bd379310c4b1c7012be4bc75efde7a8"}, + {file = "google_re2-1.1-2-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a1a30626ba48b4070f3eab272d860ef1952e710b088792c4d68dddb155be6bfc"}, + {file = "google_re2-1.1-2-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1b9c1ffcfbc3095b6ff601ec2d2bf662988f6ea6763bc1c9d52bec55881f8fde"}, + {file = "google_re2-1.1-2-cp39-cp39-win32.whl", hash = "sha256:32ecf995a252c0548404c1065ba4b36f1e524f1f4a86b6367a1a6c3da3801e30"}, + {file = "google_re2-1.1-2-cp39-cp39-win_amd64.whl", hash = "sha256:e7865410f3b112a3609739283ec3f4f6f25aae827ff59c6bfdf806fd394d753e"}, + {file = "google_re2-1.1-3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3b21f83f0a201009c56f06fcc7294a33555ede97130e8a91b3f4cae01aed1d73"}, + {file = "google_re2-1.1-3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b38194b91354a38db1f86f25d09cdc6ac85d63aee4c67b43da3048ce637adf45"}, + {file = "google_re2-1.1-3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e7da3da8d6b5a18d6c3b61b11cc5b66b8564eaedce99d2312b15b6487730fc76"}, + {file = "google_re2-1.1-3-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:aeca656fb10d8638f245331aabab59c9e7e051ca974b366dd79e6a9efb12e401"}, + {file = "google_re2-1.1-3-cp310-cp310-macosx_13_0_arm64.whl", hash = 
"sha256:2069d6dc94f5fa14a159bf99cad2f11e9c0f8ec3b7f44a4dde9e59afe5d1c786"}, + {file = "google_re2-1.1-3-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:2319a39305a4931cb5251451f2582713418a19bef2af7adf9e2a7a0edd939b99"}, + {file = "google_re2-1.1-3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb98fc131699756c6d86246f670a5e1c1cc1ba85413c425ad344cb30479b246c"}, + {file = "google_re2-1.1-3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a6e038986d8ffe4e269f8532f03009f229d1f6018d4ac0dabc8aff876338f6e0"}, + {file = "google_re2-1.1-3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8618343ee658310e0f53bf586fab7409de43ce82bf8d9f7eb119536adc9783fd"}, + {file = "google_re2-1.1-3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d8140ca861cfe00602319cefe2c7b8737b379eb07fb328b51dc44584f47a2718"}, + {file = "google_re2-1.1-3-cp310-cp310-win32.whl", hash = "sha256:41f439c5c54e8a3a0a1fa2dbd1e809d3f643f862df7b16dd790f36a1238a272e"}, + {file = "google_re2-1.1-3-cp310-cp310-win_amd64.whl", hash = "sha256:fe20e97a33176d96d3e4b5b401de35182b9505823abea51425ec011f53ef5e56"}, + {file = "google_re2-1.1-3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c39ff52b1765db039f690ee5b7b23919d8535aae94db7996079fbde0098c4d7"}, + {file = "google_re2-1.1-3-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:5420be674fd164041639ba4c825450f3d4bd635572acdde16b3dcd697f8aa3ef"}, + {file = "google_re2-1.1-3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:ff53881cf1ce040f102a42d39db93c3f835f522337ae9c79839a842f26d97733"}, + {file = "google_re2-1.1-3-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:8d04600b0b53523118df2e413a71417c408f20dee640bf07dfab601c96a18a77"}, + {file = "google_re2-1.1-3-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:c4835d4849faa34a7fa1074098d81c420ed6c0707a3772482b02ce14f2a7c007"}, + {file = "google_re2-1.1-3-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:3309a9b81251d35fee15974d0ae0581a9a375266deeafdc3a3ac0d172a742357"}, + {file = "google_re2-1.1-3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e2b51cafee7e0bc72d0a4a454547bd8f257cde412ac9f1a2dc46a203b5e42cf4"}, + {file = "google_re2-1.1-3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:83f5f1cb52f832c2297d271ee8c56cf5e9053448162e5d2223d513f729bad908"}, + {file = "google_re2-1.1-3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:55865a1ace92be3f7953b2e2b38b901d8074a367aa491daee43260a53a7fc6f0"}, + {file = "google_re2-1.1-3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cec2167dd142e583e98c783bd0d28b8cf5a9cdbe1f7407ba4163fe3ccb613cb9"}, + {file = "google_re2-1.1-3-cp311-cp311-win32.whl", hash = "sha256:a0bc1fe96849e4eb8b726d0bba493f5b989372243b32fe20729cace02e5a214d"}, + {file = "google_re2-1.1-3-cp311-cp311-win_amd64.whl", hash = "sha256:e6310a156db96fc5957cb007dd2feb18476898654530683897469447df73a7cd"}, + {file = "google_re2-1.1-3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8e63cd10ea006088b320e8c5d308da1f6c87aa95138a71c60dd7ca1c8e91927e"}, + {file = "google_re2-1.1-3-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:12b566830a334178733a85e416b1e0507dbc0ceb322827616fe51ef56c5154f1"}, + {file = "google_re2-1.1-3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:442e18c9d46b225c1496919c16eafe8f8d9bb4091b00b4d3440da03c55bbf4ed"}, + {file = "google_re2-1.1-3-cp312-cp312-macosx_12_0_x86_64.whl", 
hash = "sha256:c54c00263a9c39b2dacd93e9636319af51e3cf885c080b9680a9631708326460"}, + {file = "google_re2-1.1-3-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:15a3caeeb327bc22e0c9f95eb76890fec8874cacccd2b01ff5c080ab4819bbec"}, + {file = "google_re2-1.1-3-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:59ec0d2cced77f715d41f6eafd901f6b15c11e28ba25fe0effdc1de554d78e75"}, + {file = "google_re2-1.1-3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:185bf0e3441aed3840590f8e42f916e2920d235eb14df2cbc2049526803d3e71"}, + {file = "google_re2-1.1-3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:586d3f2014eea5be14d8de53374d9b79fa99689160e00efa64b5fe93af326087"}, + {file = "google_re2-1.1-3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cc2575082de4ffd234d9607f3ae67ca22b15a1a88793240e2045f3b3a36a5795"}, + {file = "google_re2-1.1-3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:59c5ad438eddb3630def394456091284d7bbc5b89351987f94f3792d296d1f96"}, + {file = "google_re2-1.1-3-cp312-cp312-win32.whl", hash = "sha256:5b9878c53f2bf16f75bf71d4ddd57f6611351408d5821040e91c53ebdf82c373"}, + {file = "google_re2-1.1-3-cp312-cp312-win_amd64.whl", hash = "sha256:4fdecfeb213110d0a85bad335a8e7cdb59fea7de81a4fe659233f487171980f9"}, + {file = "google_re2-1.1-3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2dd87bacab32b709c28d0145fe75a956b6a39e28f0726d867375dba5721c76c1"}, + {file = "google_re2-1.1-3-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:55d24c61fe35dddc1bb484593a57c9f60f9e66d7f31f091ef9608ed0b6dde79f"}, + {file = "google_re2-1.1-3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a0cf1180d908622df648c26b0cd09281f92129805ccc56a39227fdbfeab95cb4"}, + {file = "google_re2-1.1-3-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:09586f07f3f88d432265c75976da1c619ab7192cd7ebdf53f4ae0776c19e4b56"}, + {file = "google_re2-1.1-3-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:539f1b053402203576e919a06749198da4ae415931ee28948a1898131ae932ce"}, + {file = "google_re2-1.1-3-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:abf0bcb5365b0e27a5a23f3da403dffdbbac2c0e3a3f1535a8b10cc121b5d5fb"}, + {file = "google_re2-1.1-3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:19c83e5bbed7958213eeac3aa71c506525ce54faf03e07d0b96cd0a764890511"}, + {file = "google_re2-1.1-3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3348e77330ff672dc44ec01894fa5d93c409a532b6d688feac55e714e9059920"}, + {file = "google_re2-1.1-3-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:06b63edb57c5ce5a13eabfd71155e346b9477dc8906dec7c580d4f70c16a7e0d"}, + {file = "google_re2-1.1-3-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:12fe57ba2914092b83338d61d8def9ebd5a2bd0fd8679eceb5d4c2748105d5c0"}, + {file = "google_re2-1.1-3-cp38-cp38-win32.whl", hash = "sha256:80796e08d24e606e675019fe8de4eb5c94bb765be13c384f2695247d54a6df75"}, + {file = "google_re2-1.1-3-cp38-cp38-win_amd64.whl", hash = "sha256:3c2257dedfe7cc5deb6791e563af9e071a9d414dad89e37ac7ad22f91be171a9"}, + {file = "google_re2-1.1-3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:43a0cd77c87c894f28969ac622f94b2e6d1571261dfdd785026848a25cfdc9b9"}, + {file = "google_re2-1.1-3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:1038990b77fd66f279bd66a0832b67435ea925e15bb59eafc7b60fdec812b616"}, + {file = "google_re2-1.1-3-cp39-cp39-macosx_12_0_arm64.whl", hash = 
"sha256:fb5dda6875d18dd45f0f24ebced6d1f7388867c8fb04a235d1deab7ea479ce38"}, + {file = "google_re2-1.1-3-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:bb1d164965c6d57a351b421d2f77c051403766a8b75aaa602324ee2451fff77f"}, + {file = "google_re2-1.1-3-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a072ebfa495051d07ffecbf6ce21eb84793568d5c3c678c00ed8ff6b8066ab31"}, + {file = "google_re2-1.1-3-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:4eb66c8398c8a510adc97978d944b3b29c91181237218841ea1a91dc39ec0e54"}, + {file = "google_re2-1.1-3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f7c8b57b1f559553248d1757b7fa5b2e0cc845666738d155dff1987c2618264e"}, + {file = "google_re2-1.1-3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9162f6aa4f25453c682eb176f21b8e2f40205be9f667e98a54b3e1ff10d6ee75"}, + {file = "google_re2-1.1-3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a2d65ddf67fd7bf94705626871d463057d3d9a3538d41022f95b9d8f01df36e1"}, + {file = "google_re2-1.1-3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d140c7b9395b4d1e654127aa1c99bcc603ed01000b7bc7e28c52562f1894ec12"}, + {file = "google_re2-1.1-3-cp39-cp39-win32.whl", hash = "sha256:80c5fc200f64b2d903eeb07b8d6cefc620a872a0240c7caaa9aca05b20f5568f"}, + {file = "google_re2-1.1-3-cp39-cp39-win_amd64.whl", hash = "sha256:9eb6dbcee9b5dc4069bbc0634f2eb039ca524a14bed5868fdf6560aaafcbca06"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0db114d7e1aa96dbcea452a40136d7d747d60cbb61394965774688ef59cccd4e"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:82133958e003a1344e5b7a791b9a9dd7560b5c8f96936dbe16f294604524a633"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:9e74fd441d1f3d917d3303e319f61b82cdbd96b9a5ba919377a6eef1504a1e2b"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:734a2e7a4541c57253b5ebee24f3f3366ba3658bcad01da25fb623c78723471a"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:d88d5eecbc908abe16132456fae13690d0508f3ac5777f320ef95cb6cab9a961"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:b91db80b171ecec435a07977a227757dd487356701a32f556fa6fca5d0a40522"}, + {file = "google_re2-1.1-4-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b23129887a64bb9948af14c84705273ed1a40054e99433b4acccab4dcf6a226"}, + {file = "google_re2-1.1-4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5dc1a0cc7cd19261dcaf76763e2499305dbb7e51dc69555167cdb8af98782698"}, + {file = "google_re2-1.1-4-cp310-cp310-win32.whl", hash = "sha256:3b2ab1e2420b5dd9743a2d6bc61b64e5f708563702a75b6db86637837eaeaf2f"}, + {file = "google_re2-1.1-4-cp310-cp310-win_amd64.whl", hash = "sha256:92efca1a7ef83b6df012d432a1cbc71d10ff42200640c0f9a5ff5b343a48e633"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:854818fd4ce79787aca5ba459d6e5abe4ca9be2c684a5b06a7f1757452ca3708"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:4ceef51174b6f653b6659a8fdaa9c38960c5228b44b25be2a3bcd8566827554f"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:ee49087c3db7e6f5238105ab5299c09e9b77516fe8cfb0a37e5f1e813d76ecb8"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_13_0_x86_64.whl", hash = 
"sha256:dc2312854bdc01410acc5d935f1906a49cb1f28980341c20a68797ad89d8e178"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:0dc0d2e42296fa84a3cb3e1bd667c6969389cd5cdf0786e6b1f911ae2d75375b"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6bf04ced98453b035f84320f348f67578024f44d2997498def149054eb860ae8"}, + {file = "google_re2-1.1-4-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d6b6ef11dc4ab322fa66c2f3561925f2b5372a879c3ed764d20e939e2fd3e5f"}, + {file = "google_re2-1.1-4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0dcde6646fa9a97fd3692b3f6ae7daf7f3277d7500b6c253badeefa11db8956a"}, + {file = "google_re2-1.1-4-cp311-cp311-win32.whl", hash = "sha256:5f4f0229deb057348893574d5b0a96d055abebac6debf29d95b0c0e26524c9f6"}, + {file = "google_re2-1.1-4-cp311-cp311-win_amd64.whl", hash = "sha256:4713ddbe48a18875270b36a462b0eada5e84d6826f8df7edd328d8706b6f9d07"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:40a698300b8faddbb325662973f839489c89b960087060bd389c376828978a04"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:103d2d7ac92ba23911a151fd1fc7035cbf6dc92a7f6aea92270ebceb5cd5acd3"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:51fb7182bccab05e8258a2b6a63dda1a6b4a9e8dfb9b03ec50e50c49c2827dd4"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:65383022abd63d7b620221eba7935132b53244b8b463d8fdce498c93cf58b7b7"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:396281fc68a9337157b3ffcd9392c6b7fcb8aab43e5bdab496262a81d56a4ecc"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:8198adcfcff1c680e052044124621730fc48d08005f90a75487f5651f1ebfce2"}, + {file = "google_re2-1.1-4-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:81f7bff07c448aec4db9ca453d2126ece8710dbd9278b8bb09642045d3402a96"}, + {file = "google_re2-1.1-4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7dacf730fd7d6ec71b11d6404b0b26e230814bfc8e9bb0d3f13bec9b5531f8d"}, + {file = "google_re2-1.1-4-cp312-cp312-win32.whl", hash = "sha256:8c764f62f4b1d89d1ef264853b6dd9fee14a89e9b86a81bc2157fe3531425eb4"}, + {file = "google_re2-1.1-4-cp312-cp312-win_amd64.whl", hash = "sha256:0be2666df4bc5381a5d693585f9bbfefb0bfd3c07530d7e403f181f5de47254a"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:5cb1b63a0bfd8dd65d39d2f3b2e5ae0a06ce4b2ce5818a1d1fc78a786a252673"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:e41751ce6b67a95230edd0772226dc94c2952a2909674cd69df9804ed0125307"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:b998cfa2d50bf4c063e777c999a7e8645ec7e5d7baf43ad71b1e2e10bb0300c3"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:226ca3b0c2e970f3fc82001ac89e845ecc7a4bb7c68583e7a76cda70b61251a7"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:9adec1f734ebad7c72e56c85f205a281d8fe9bf6583bc21020157d3f2812ce89"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:9c34f3c64ba566af967d29e11299560e6fdfacd8ca695120a7062b6ed993b179"}, + {file = "google_re2-1.1-4-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e1b85385fe293838e0d0b6e19e6c48ba8c6f739ea92ce2e23b718afe7b343363"}, + {file 
= "google_re2-1.1-4-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4694daa8a8987cfb568847aa872f9990e930c91a68c892ead876411d4b9012c3"}, + {file = "google_re2-1.1-4-cp38-cp38-win32.whl", hash = "sha256:5e671e9be1668187e2995aac378de574fa40df70bb6f04657af4d30a79274ce0"}, + {file = "google_re2-1.1-4-cp38-cp38-win_amd64.whl", hash = "sha256:f66c164d6049a8299f6dfcfa52d1580576b4b9724d6fcdad2f36f8f5da9304b6"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:25cb17ae0993a48c70596f3a3ef5d659638106401cc8193f51c0d7961b3b3eb7"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:5f101f86d14ca94ca4dcf63cceaa73d351f2be2481fcaa29d9e68eeab0dc2a88"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:4e82591e85bf262a6d74cff152867e05fc97867c68ba81d6836ff8b0e7e62365"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:1f61c09b93ffd34b1e2557e5a9565039f935407a5786dbad46f64f1a484166e6"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:12b390ad8c7e74bab068732f774e75e0680dade6469b249a721f3432f90edfc3"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:1284343eb31c2e82ed2d8159f33ba6842238a56782c881b07845a6d85613b055"}, + {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c7b38e0daf2c06e4d3163f4c732ab3ad2521aecfed6605b69e4482c612da303"}, + {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"}, + {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = "sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"}, + {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"}, ] [[package]] @@ -3949,6 +4113,38 @@ cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] +[[package]] +name = "huggingface-hub" +version = "0.20.3" +description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" +optional = true +python-versions = ">=3.8.0" +files = [ + {file = "huggingface_hub-0.20.3-py3-none-any.whl", hash = "sha256:d988ae4f00d3e307b0c80c6a05ca6dbb7edba8bba3079f74cda7d9c2e562a7b6"}, + {file = "huggingface_hub-0.20.3.tar.gz", hash = "sha256:94e7f8e074475fbc67d6a71957b678e1b4a74ff1b64a644fd6cbb83da962d05d"}, +] + +[package.dependencies] +filelock = "*" +fsspec = ">=2023.5.0" +packaging = ">=20.9" +pyyaml = ">=5.1" +requests = "*" +tqdm = ">=4.42.1" +typing-extensions = ">=3.7.4.3" + +[package.extras] +all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "mypy (==1.5.1)", "numpy", "pydantic (>1.1,<2.0)", "pydantic (>1.1,<3.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.1.3)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +cli = ["InquirerPy (==0.3.4)"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "mypy (==1.5.1)", "numpy", "pydantic (>1.1,<2.0)", "pydantic (>1.1,<3.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.1.3)", "soundfile", "types-PyYAML", 
"types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] +inference = ["aiohttp", "pydantic (>1.1,<2.0)", "pydantic (>1.1,<3.0)"] +quality = ["mypy (==1.5.1)", "ruff (>=0.1.3)"] +tensorflow = ["graphviz", "pydot", "tensorflow"] +testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "numpy", "pydantic (>1.1,<2.0)", "pydantic (>1.1,<3.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] +torch = ["torch"] +typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"] + [[package]] name = "humanfriendly" version = "10.0" @@ -4438,6 +4634,24 @@ sqlalchemy = ["sqlalchemy"] test = ["mock", "pytest", "pytest-cov (<2.6)"] zmq = ["pyzmq"] +[[package]] +name = "loguru" +version = "0.7.2" +description = "Python logging made (stupidly) simple" +optional = true +python-versions = ">=3.5" +files = [ + {file = "loguru-0.7.2-py3-none-any.whl", hash = "sha256:003d71e3d3ed35f0f8984898359d65b79e5b21943f78af86aa5491210429b8eb"}, + {file = "loguru-0.7.2.tar.gz", hash = "sha256:e671a53522515f34fd406340ee968cb9ecafbc4b36c679da03c18fd8d0bd51ac"}, +] + +[package.dependencies] +colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""} +win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""} + +[package.extras] +dev = ["Sphinx (==7.2.5)", "colorama (==0.4.5)", "colorama (==0.4.6)", "exceptiongroup (==1.1.3)", "freezegun (==1.1.0)", "freezegun (==1.2.2)", "mypy (==v0.910)", "mypy (==v0.971)", "mypy (==v1.4.1)", "mypy (==v1.5.1)", "pre-commit (==3.4.0)", "pytest (==6.1.2)", "pytest (==7.4.0)", "pytest-cov (==2.12.1)", "pytest-cov (==4.1.0)", "pytest-mypy-plugins (==1.9.3)", "pytest-mypy-plugins (==3.0.0)", "sphinx-autobuild (==2021.3.14)", "sphinx-rtd-theme (==1.3.0)", "tox (==3.27.1)", "tox (==4.11.0)"] + [[package]] name = "lxml" version = "4.9.3" @@ -4448,10 +4662,13 @@ files = [ {file = "lxml-4.9.3-cp27-cp27m-macosx_11_0_x86_64.whl", hash = "sha256:b0a545b46b526d418eb91754565ba5b63b1c0b12f9bd2f808c852d9b4b2f9b5c"}, {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:075b731ddd9e7f68ad24c635374211376aa05a281673ede86cbe1d1b3455279d"}, {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1e224d5755dba2f4a9498e150c43792392ac9b5380aa1b845f98a1618c94eeef"}, + {file = "lxml-4.9.3-cp27-cp27m-win32.whl", hash = "sha256:2c74524e179f2ad6d2a4f7caf70e2d96639c0954c943ad601a9e146c76408ed7"}, + {file = "lxml-4.9.3-cp27-cp27m-win_amd64.whl", hash = "sha256:4f1026bc732b6a7f96369f7bfe1a4f2290fb34dce00d8644bc3036fb351a4ca1"}, {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c0781a98ff5e6586926293e59480b64ddd46282953203c76ae15dbbbf302e8bb"}, {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cef2502e7e8a96fe5ad686d60b49e1ab03e438bd9123987994528febd569868e"}, {file = "lxml-4.9.3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b86164d2cff4d3aaa1f04a14685cbc072efd0b4f99ca5708b2ad1b9b5988a991"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:42871176e7896d5d45138f6d28751053c711ed4d48d8e30b498da155af39aebd"}, + {file = 
"lxml-4.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ae8b9c6deb1e634ba4f1930eb67ef6e6bf6a44b6eb5ad605642b2d6d5ed9ce3c"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:411007c0d88188d9f621b11d252cce90c4a2d1a49db6c068e3c16422f306eab8"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:cd47b4a0d41d2afa3e58e5bf1f62069255aa2fd6ff5ee41604418ca925911d76"}, {file = "lxml-4.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e2cb47860da1f7e9a5256254b74ae331687b9672dfa780eed355c4c9c3dbd23"}, @@ -4460,6 +4677,7 @@ files = [ {file = "lxml-4.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:97047f0d25cd4bcae81f9ec9dc290ca3e15927c192df17331b53bebe0e3ff96d"}, {file = "lxml-4.9.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:1f447ea5429b54f9582d4b955f5f1985f278ce5cf169f72eea8afd9502973dd5"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:57d6ba0ca2b0c462f339640d22882acc711de224d769edf29962b09f77129cbf"}, + {file = "lxml-4.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:9767e79108424fb6c3edf8f81e6730666a50feb01a328f4a016464a5893f835a"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:71c52db65e4b56b8ddc5bb89fb2e66c558ed9d1a74a45ceb7dcb20c191c3df2f"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d73d8ecf8ecf10a3bd007f2192725a34bd62898e8da27eb9d32a58084f93962b"}, {file = "lxml-4.9.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0a3d3487f07c1d7f150894c238299934a2a074ef590b583103a45002035be120"}, @@ -4479,6 +4697,7 @@ files = [ {file = "lxml-4.9.3-cp36-cp36m-macosx_11_0_x86_64.whl", hash = "sha256:64f479d719dc9f4c813ad9bb6b28f8390360660b73b2e4beb4cb0ae7104f1c12"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:dd708cf4ee4408cf46a48b108fb9427bfa00b9b85812a9262b5c668af2533ea5"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c31c7462abdf8f2ac0577d9f05279727e698f97ecbb02f17939ea99ae8daa98"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e3cd95e10c2610c360154afdc2f1480aea394f4a4f1ea0a5eacce49640c9b190"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:4930be26af26ac545c3dffb662521d4e6268352866956672231887d18f0eaab2"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4aec80cde9197340bc353d2768e2a75f5f60bacda2bab72ab1dc499589b3878c"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:14e019fd83b831b2e61baed40cab76222139926b1fb5ed0e79225bc0cae14584"}, @@ -4488,6 +4707,7 @@ files = [ {file = "lxml-4.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bef4e656f7d98aaa3486d2627e7d2df1157d7e88e7efd43a65aa5dd4714916cf"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:46f409a2d60f634fe550f7133ed30ad5321ae2e6630f13657fb9479506b00601"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:4c28a9144688aef80d6ea666c809b4b0e50010a2aca784c97f5e6bf143d9f129"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = 
"sha256:141f1d1a9b663c679dc524af3ea1773e618907e96075262726c7612c02b149a4"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:53ace1c1fd5a74ef662f844a0413446c0629d151055340e9893da958a374f70d"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17a753023436a18e27dd7769e798ce302963c236bc4114ceee5b25c18c52c693"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7d298a1bd60c067ea75d9f684f5f3992c9d6766fadbc0bcedd39750bf344c2f4"}, @@ -4497,6 +4717,7 @@ files = [ {file = "lxml-4.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:120fa9349a24c7043854c53cae8cec227e1f79195a7493e09e0c12e29f918e52"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4d2d1edbca80b510443f51afd8496be95529db04a509bc8faee49c7b0fb6d2cc"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:8d7e43bd40f65f7d97ad8ef5c9b1778943d02f04febef12def25f7583d19baac"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:71d66ee82e7417828af6ecd7db817913cb0cf9d4e61aa0ac1fde0583d84358db"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:6fc3c450eaa0b56f815c7b62f2b7fba7266c4779adcf1cece9e6deb1de7305ce"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:65299ea57d82fb91c7f019300d24050c4ddeb7c5a190e076b5f48a2b43d19c42"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:eadfbbbfb41b44034a4c757fd5d70baccd43296fb894dba0295606a7cf3124aa"}, @@ -4506,6 +4727,7 @@ files = [ {file = "lxml-4.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:92af161ecbdb2883c4593d5ed4815ea71b31fafd7fd05789b23100d081ecac96"}, {file = "lxml-4.9.3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:9bb6ad405121241e99a86efff22d3ef469024ce22875a7ae045896ad23ba2340"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8ed74706b26ad100433da4b9d807eae371efaa266ffc3e9191ea436087a9d6a7"}, + {file = "lxml-4.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:fbf521479bcac1e25a663df882c46a641a9bff6b56dc8b0fafaebd2f66fb231b"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:303bf1edce6ced16bf67a18a1cf8339d0db79577eec5d9a6d4a80f0fb10aa2da"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:5515edd2a6d1a5a70bfcdee23b42ec33425e405c5b351478ab7dc9347228f96e"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:690dafd0b187ed38583a648076865d8c229661ed20e48f2335d68e2cf7dc829d"}, @@ -4516,13 +4738,16 @@ files = [ {file = "lxml-4.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:4dd9a263e845a72eacb60d12401e37c616438ea2e5442885f65082c276dfb2b2"}, {file = "lxml-4.9.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6689a3d7fd13dc687e9102a27e98ef33730ac4fe37795d5036d18b4d527abd35"}, {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:f6bdac493b949141b733c5345b6ba8f87a226029cbabc7e9e121a413e49441e0"}, + {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:05186a0f1346ae12553d66df1cfce6f251589fea3ad3da4f3ef4e34b2d58c6a3"}, {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", 
hash = "sha256:c2006f5c8d28dee289f7020f721354362fa304acbaaf9745751ac4006650254b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-macosx_11_0_x86_64.whl", hash = "sha256:5c245b783db29c4e4fbbbfc9c5a78be496c9fea25517f90606aa1f6b2b3d5f7b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4fb960a632a49f2f089d522f70496640fdf1218f1243889da3822e0a9f5f3ba7"}, + {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:50670615eaf97227d5dc60de2dc99fb134a7130d310d783314e7724bf163f75d"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9719fe17307a9e814580af1f5c6e05ca593b12fb7e44fe62450a5384dbf61b4b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:3331bece23c9ee066e0fb3f96c61322b9e0f54d775fccefff4c38ca488de283a"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-macosx_11_0_x86_64.whl", hash = "sha256:ed667f49b11360951e201453fc3967344d0d0263aa415e1619e85ae7fd17b4e0"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8b77946fd508cbf0fccd8e400a7f71d4ac0e1595812e66025bac475a8e811694"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e4da8ca0c0c0aea88fd46be8e44bd49716772358d648cce45fe387f7b92374a7"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fe4bda6bd4340caa6e5cf95e73f8fea5c4bfc55763dd42f1b50a94c1b4a2fbd4"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f3df3db1d336b9356dd3112eae5f5c2b8b377f3bc826848567f10bfddfee77e9"}, {file = "lxml-4.9.3.tar.gz", hash = "sha256:48628bd53a426c9eb9bc066a923acaa0878d1e86129fd5359aee99285f4eed9c"}, @@ -4683,6 +4908,16 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -5468,35 +5703,36 @@ reference = ["Pillow", "google-re2"] [[package]] name = "onnxruntime" -version = "1.16.1" +version = "1.18.0" description = "ONNX Runtime is a runtime accelerator for Machine Learning models" optional = true python-versions = "*" files = [ - {file = "onnxruntime-1.16.1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:28b2c7f444b4119950b69370801cd66067f403d19cbaf2a444735d7c269cce4a"}, - {file = "onnxruntime-1.16.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c24e04f33e7899f6aebb03ed51e51d346c1f906b05c5569d58ac9a12d38a2f58"}, - {file = "onnxruntime-1.16.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fa93b166f2d97063dc9f33c5118c5729a4a5dd5617296b6dbef42f9047b3e81"}, - {file = "onnxruntime-1.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:042dd9201b3016ee18f8f8bc4609baf11ff34ca1ff489c0a46bcd30919bf883d"}, - {file = "onnxruntime-1.16.1-cp310-cp310-win32.whl", hash = "sha256:c20aa0591f305012f1b21aad607ed96917c86ae7aede4a4dd95824b3d124ceb7"}, - {file = "onnxruntime-1.16.1-cp310-cp310-win_amd64.whl", hash = "sha256:5581873e578917bea76d6434ee7337e28195d03488dcf72d161d08e9398c6249"}, - {file = "onnxruntime-1.16.1-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:ef8c0c8abf5f309aa1caf35941380839dc5f7a2fa53da533be4a3f254993f120"}, - {file = "onnxruntime-1.16.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e680380bea35a137cbc3efd67a17486e96972901192ad3026ee79c8d8fe264f7"}, - {file = "onnxruntime-1.16.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e62cc38ce1a669013d0a596d984762dc9c67c56f60ecfeee0d5ad36da5863f6"}, - {file = "onnxruntime-1.16.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:025c7a4d57bd2e63b8a0f84ad3df53e419e3df1cc72d63184f2aae807b17c13c"}, - {file = "onnxruntime-1.16.1-cp311-cp311-win32.whl", hash = "sha256:9ad074057fa8d028df248b5668514088cb0937b6ac5954073b7fb9b2891ffc8c"}, - {file = "onnxruntime-1.16.1-cp311-cp311-win_amd64.whl", hash = "sha256:d5e43a3478bffc01f817ecf826de7b25a2ca1bca8547d70888594ab80a77ad24"}, - {file = "onnxruntime-1.16.1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:3aef4d70b0930e29a8943eab248cd1565664458d3a62b2276bd11181f28fd0a3"}, - {file = "onnxruntime-1.16.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:55a7b843a57c8ca0c8ff169428137958146081d5d76f1a6dd444c4ffcd37c3c2"}, - {file = "onnxruntime-1.16.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:62c631af1941bf3b5f7d063d24c04aacce8cff0794e157c497e315e89ac5ad7b"}, - {file = "onnxruntime-1.16.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:5671f296c3d5c233f601e97a10ab5a1dd8e65ba35c7b7b0c253332aba9dff330"}, - {file = "onnxruntime-1.16.1-cp38-cp38-win32.whl", hash = "sha256:eb3802305023dd05e16848d4e22b41f8147247894309c0c27122aaa08793b3d2"}, - {file = "onnxruntime-1.16.1-cp38-cp38-win_amd64.whl", hash = "sha256:fecfb07443d09d271b1487f401fbdf1ba0c829af6fd4fe8f6af25f71190e7eb9"}, - {file = "onnxruntime-1.16.1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:de3e12094234db6545c67adbf801874b4eb91e9f299bda34c62967ef0050960f"}, - {file = "onnxruntime-1.16.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ff723c2a5621b5e7103f3be84d5aae1e03a20621e72219dddceae81f65f240af"}, - {file = "onnxruntime-1.16.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14a7fb3073aaf6b462e3d7fb433320f7700558a8892e5021780522dc4574292a"}, - {file = "onnxruntime-1.16.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:963159f1f699b0454cd72fcef3276c8a1aab9389a7b301bcd8e320fb9d9e8597"}, - {file = "onnxruntime-1.16.1-cp39-cp39-win32.whl", hash = "sha256:85771adb75190db9364b25ddec353ebf07635b83eb94b64ed014f1f6d57a3857"}, - {file = "onnxruntime-1.16.1-cp39-cp39-win_amd64.whl", hash = "sha256:d32d2b30799c1f950123c60ae8390818381fd5f88bdf3627eeca10071c155dc5"}, + {file = "onnxruntime-1.18.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:5a3b7993a5ecf4a90f35542a4757e29b2d653da3efe06cdd3164b91167bbe10d"}, + {file = "onnxruntime-1.18.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15b944623b2cdfe7f7945690bfb71c10a4531b51997c8320b84e7b0bb59af902"}, + {file = "onnxruntime-1.18.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e61ce5005118064b1a0ed73ebe936bc773a102f067db34108ea6c64dd62a179"}, + {file = "onnxruntime-1.18.0-cp310-cp310-win32.whl", hash = "sha256:a4fc8a2a526eb442317d280610936a9f73deece06c7d5a91e51570860802b93f"}, + {file = "onnxruntime-1.18.0-cp310-cp310-win_amd64.whl", hash = "sha256:71ed219b768cab004e5cd83e702590734f968679bf93aa488c1a7ffbe6e220c3"}, + {file = "onnxruntime-1.18.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:3d24bd623872a72a7fe2f51c103e20fcca2acfa35d48f2accd6be1ec8633d960"}, + {file = "onnxruntime-1.18.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f15e41ca9b307a12550bfd2ec93f88905d9fba12bab7e578f05138ad0ae10d7b"}, + {file = "onnxruntime-1.18.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f45ca2887f62a7b847d526965686b2923efa72538c89b7703c7b3fe970afd59"}, + {file = "onnxruntime-1.18.0-cp311-cp311-win32.whl", hash = "sha256:9e24d9ecc8781323d9e2eeda019b4b24babc4d624e7d53f61b1fe1a929b0511a"}, + {file = "onnxruntime-1.18.0-cp311-cp311-win_amd64.whl", hash = "sha256:f8608398976ed18aef450d83777ff6f77d0b64eced1ed07a985e1a7db8ea3771"}, + {file = "onnxruntime-1.18.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:f1d79941f15fc40b1ee67738b2ca26b23e0181bf0070b5fb2984f0988734698f"}, + {file = "onnxruntime-1.18.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:99e8caf3a8565c853a22d323a3eebc2a81e3de7591981f085a4f74f7a60aab2d"}, + {file = "onnxruntime-1.18.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:498d2b8380635f5e6ebc50ec1b45f181588927280f32390fb910301d234f97b8"}, + {file = "onnxruntime-1.18.0-cp312-cp312-win32.whl", hash = "sha256:ba7cc0ce2798a386c082aaa6289ff7e9bedc3dee622eef10e74830cff200a72e"}, + {file = "onnxruntime-1.18.0-cp312-cp312-win_amd64.whl", hash = 
"sha256:1fa175bd43f610465d5787ae06050c81f7ce09da2bf3e914eb282cb8eab363ef"}, + {file = "onnxruntime-1.18.0-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:0284c579c20ec8b1b472dd190290a040cc68b6caec790edb960f065d15cf164a"}, + {file = "onnxruntime-1.18.0-cp38-cp38-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d47353d036d8c380558a5643ea5f7964d9d259d31c86865bad9162c3e916d1f6"}, + {file = "onnxruntime-1.18.0-cp38-cp38-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:885509d2b9ba4b01f08f7fa28d31ee54b6477953451c7ccf124a84625f07c803"}, + {file = "onnxruntime-1.18.0-cp38-cp38-win32.whl", hash = "sha256:8614733de3695656411d71fc2f39333170df5da6c7efd6072a59962c0bc7055c"}, + {file = "onnxruntime-1.18.0-cp38-cp38-win_amd64.whl", hash = "sha256:47af3f803752fce23ea790fd8d130a47b2b940629f03193f780818622e856e7a"}, + {file = "onnxruntime-1.18.0-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:9153eb2b4d5bbab764d0aea17adadffcfc18d89b957ad191b1c3650b9930c59f"}, + {file = "onnxruntime-1.18.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2c7fd86eca727c989bb8d9c5104f3c45f7ee45f445cc75579ebe55d6b99dfd7c"}, + {file = "onnxruntime-1.18.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ac67a4de9c1326c4d87bcbfb652c923039b8a2446bb28516219236bec3b494f5"}, + {file = "onnxruntime-1.18.0-cp39-cp39-win32.whl", hash = "sha256:6ffb445816d06497df7a6dd424b20e0b2c39639e01e7fe210e247b82d15a23b9"}, + {file = "onnxruntime-1.18.0-cp39-cp39-win_amd64.whl", hash = "sha256:46de6031cb6745f33f7eca9e51ab73e8c66037fb7a3b6b4560887c5b55ab5d5d"}, ] [package.dependencies] @@ -6717,6 +6953,7 @@ files = [ {file = "pymongo-4.6.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ab6bcc8e424e07c1d4ba6df96f7fb963bcb48f590b9456de9ebd03b88084fe8"}, {file = "pymongo-4.6.0-cp312-cp312-win32.whl", hash = "sha256:47aa128be2e66abd9d1a9b0437c62499d812d291f17b55185cb4aa33a5f710a4"}, {file = "pymongo-4.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:014e7049dd019a6663747ca7dae328943e14f7261f7c1381045dfc26a04fa330"}, + {file = "pymongo-4.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e24025625bad66895b1bc3ae1647f48f0a92dd014108fb1be404c77f0b69ca67"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:288c21ab9531b037f7efa4e467b33176bc73a0c27223c141b822ab4a0e66ff2a"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:747c84f4e690fbe6999c90ac97246c95d31460d890510e4a3fa61b7d2b87aa34"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:055f5c266e2767a88bb585d01137d9c7f778b0195d3dbf4a487ef0638be9b651"}, @@ -7157,6 +7394,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = 
"PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -7164,8 +7402,16 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -7182,6 +7428,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = 
"PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -7189,6 +7436,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -7196,30 +7444,30 @@ files = [ [[package]] name = "qdrant-client" -version = "1.6.4" +version = "1.9.1" description = "Client library for the Qdrant vector search engine" optional = true -python-versions = ">=3.8,<3.13" +python-versions = ">=3.8" files = [ - {file = "qdrant_client-1.6.4-py3-none-any.whl", hash = "sha256:db4696978d6a62d78ff60f70b912383f1e467bda3053f732b01ddb5f93281b10"}, - {file = "qdrant_client-1.6.4.tar.gz", hash = "sha256:bbd65f383b6a55a9ccf4e301250fa925179340dd90cfde9b93ce4230fd68867b"}, + {file = "qdrant_client-1.9.1-py3-none-any.whl", hash = "sha256:b9b7e0e5c1a51410d8bb5106a869a51e12f92ab45a99030f27aba790553bd2c8"}, + {file = "qdrant_client-1.9.1.tar.gz", hash = "sha256:186b9c31d95aefe8f2db84b7746402d7365bd63b305550e530e31bde2002ce79"}, ] [package.dependencies] -fastembed = {version = "0.1.1", optional = true, markers = "python_version < \"3.12\" and extra == \"fastembed\""} +fastembed = {version = "0.2.6", optional = true, markers = "python_version < \"3.13\" and extra == \"fastembed\""} grpcio = ">=1.41.0" grpcio-tools = ">=1.41.0" -httpx = {version = ">=0.14.0", extras = ["http2"]} +httpx = {version = ">=0.20.0", extras = ["http2"]} numpy = [ {version = ">=1.21", markers = "python_version >= \"3.8\" and python_version < \"3.12\""}, {version = ">=1.26", markers = "python_version >= \"3.12\""}, ] portalocker = ">=2.7.0,<3.0.0" pydantic = ">=1.10.8" -urllib3 = ">=1.26.14,<2.0.0" +urllib3 = ">=1.26.14,<3" [package.extras] -fastembed = ["fastembed (==0.1.1)"] +fastembed = ["fastembed (==0.2.6)"] [[package]] name = "ratelimiter" @@ -8112,6 +8360,7 @@ files = [ {file = "SQLAlchemy-1.4.49-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:03db81b89fe7ef3857b4a00b63dedd632d6183d4ea5a31c5d8a92e000a41fc71"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:95b9df9afd680b7a3b13b38adf6e3a38995da5e162cc7524ef08e3be4e5ed3e1"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a63e43bf3f668c11bb0444ce6e809c1227b8f067ca1068898f3008a273f52b09"}, + {file = 
"SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca46de16650d143a928d10842939dab208e8d8c3a9a8757600cae9b7c579c5cd"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f835c050ebaa4e48b18403bed2c0fda986525896efd76c245bdd4db995e51a4c"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c21b172dfb22e0db303ff6419451f0cac891d2e911bb9fbf8003d717f1bcf91"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-win32.whl", hash = "sha256:5fb1ebdfc8373b5a291485757bd6431de8d7ed42c27439f543c81f6c8febd729"}, @@ -8121,26 +8370,35 @@ files = [ {file = "SQLAlchemy-1.4.49-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5debe7d49b8acf1f3035317e63d9ec8d5e4d904c6e75a2a9246a119f5f2fdf3d"}, {file = "SQLAlchemy-1.4.49-cp311-cp311-win32.whl", hash = "sha256:82b08e82da3756765c2e75f327b9bf6b0f043c9c3925fb95fb51e1567fa4ee87"}, {file = "SQLAlchemy-1.4.49-cp311-cp311-win_amd64.whl", hash = "sha256:171e04eeb5d1c0d96a544caf982621a1711d078dbc5c96f11d6469169bd003f1"}, + {file = "SQLAlchemy-1.4.49-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f23755c384c2969ca2f7667a83f7c5648fcf8b62a3f2bbd883d805454964a800"}, + {file = "SQLAlchemy-1.4.49-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8396e896e08e37032e87e7fbf4a15f431aa878c286dc7f79e616c2feacdb366c"}, + {file = "SQLAlchemy-1.4.49-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66da9627cfcc43bbdebd47bfe0145bb662041472393c03b7802253993b6b7c90"}, + {file = "SQLAlchemy-1.4.49-cp312-cp312-win32.whl", hash = "sha256:9a06e046ffeb8a484279e54bda0a5abfd9675f594a2e38ef3133d7e4d75b6214"}, + {file = "SQLAlchemy-1.4.49-cp312-cp312-win_amd64.whl", hash = "sha256:7cf8b90ad84ad3a45098b1c9f56f2b161601e4670827d6b892ea0e884569bd1d"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:36e58f8c4fe43984384e3fbe6341ac99b6b4e083de2fe838f0fdb91cebe9e9cb"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b31e67ff419013f99ad6f8fc73ee19ea31585e1e9fe773744c0f3ce58c039c30"}, + {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc22807a7e161c0d8f3da34018ab7c97ef6223578fcdd99b1d3e7ed1100a5db"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c14b29d9e1529f99efd550cd04dbb6db6ba5d690abb96d52de2bff4ed518bc95"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c40f3470e084d31247aea228aa1c39bbc0904c2b9ccbf5d3cfa2ea2dac06f26d"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-win32.whl", hash = "sha256:706bfa02157b97c136547c406f263e4c6274a7b061b3eb9742915dd774bbc264"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-win_amd64.whl", hash = "sha256:a7f7b5c07ae5c0cfd24c2db86071fb2a3d947da7bd487e359cc91e67ac1c6d2e"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:4afbbf5ef41ac18e02c8dc1f86c04b22b7a2125f2a030e25bbb4aff31abb224b"}, {file = 
"SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:24e300c0c2147484a002b175f4e1361f102e82c345bf263242f0449672a4bccf"}, + {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:393cd06c3b00b57f5421e2133e088df9cabcececcea180327e43b937b5a7caa5"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:201de072b818f8ad55c80d18d1a788729cccf9be6d9dc3b9d8613b053cd4836d"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7653ed6817c710d0c95558232aba799307d14ae084cc9b1f4c389157ec50df5c"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-win32.whl", hash = "sha256:647e0b309cb4512b1f1b78471fdaf72921b6fa6e750b9f891e09c6e2f0e5326f"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-win_amd64.whl", hash = "sha256:ab73ed1a05ff539afc4a7f8cf371764cdf79768ecb7d2ec691e3ff89abbc541e"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:37ce517c011560d68f1ffb28af65d7e06f873f191eb3a73af5671e9c3fada08a"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1878ce508edea4a879015ab5215546c444233881301e97ca16fe251e89f1c55"}, + {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95ab792ca493891d7a45a077e35b418f68435efb3e1706cb8155e20e86a9013c"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0e8e608983e6f85d0852ca61f97e521b62e67969e6e640fe6c6b575d4db68557"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccf956da45290df6e809ea12c54c02ace7f8ff4d765d6d3dfb3655ee876ce58d"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-win32.whl", hash = "sha256:f167c8175ab908ce48bd6550679cc6ea20ae169379e73c7720a28f89e53aa532"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-win_amd64.whl", hash = "sha256:45806315aae81a0c202752558f0df52b42d11dd7ba0097bf71e253b4215f34f4"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:b6d0c4b15d65087738a6e22e0ff461b407533ff65a73b818089efc8eb2b3e1de"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a843e34abfd4c797018fd8d00ffffa99fd5184c421f190b6ca99def4087689bd"}, + {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:738d7321212941ab19ba2acf02a68b8ee64987b248ffa2101630e8fccb549e0d"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1c890421651b45a681181301b3497e4d57c0d01dc001e10438a40e9a9c25ee77"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d26f280b8f0a8f497bc10573849ad6dc62e671d2468826e5c748d04ed9e670d5"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-win32.whl", hash = "sha256:ec2268de67f73b43320383947e74700e95c6770d0c68c4e615e9897e46296294"}, @@ -8370,56 +8628,129 @@ twisted = ["twisted"] [[package]] name = "tokenizers" 
-version = "0.13.3" -description = "Fast and Customizable Tokenizers" +version = "0.15.2" +description = "" optional = true -python-versions = "*" +python-versions = ">=3.7" files = [ - {file = "tokenizers-0.13.3-cp310-cp310-macosx_10_11_x86_64.whl", hash = "sha256:f3835c5be51de8c0a092058a4d4380cb9244fb34681fd0a295fbf0a52a5fdf33"}, - {file = "tokenizers-0.13.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:4ef4c3e821730f2692489e926b184321e887f34fb8a6b80b8096b966ba663d07"}, - {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5fd1a6a25353e9aa762e2aae5a1e63883cad9f4e997c447ec39d071020459bc"}, - {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee0b1b311d65beab83d7a41c56a1e46ab732a9eed4460648e8eb0bd69fc2d059"}, - {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ef4215284df1277dadbcc5e17d4882bda19f770d02348e73523f7e7d8b8d396"}, - {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4d53976079cff8a033f778fb9adca2d9d69d009c02fa2d71a878b5f3963ed30"}, - {file = "tokenizers-0.13.3-cp310-cp310-win32.whl", hash = "sha256:1f0e3b4c2ea2cd13238ce43548959c118069db7579e5d40ec270ad77da5833ce"}, - {file = "tokenizers-0.13.3-cp310-cp310-win_amd64.whl", hash = "sha256:89649c00d0d7211e8186f7a75dfa1db6996f65edce4b84821817eadcc2d3c79e"}, - {file = "tokenizers-0.13.3-cp311-cp311-macosx_10_11_universal2.whl", hash = "sha256:56b726e0d2bbc9243872b0144515ba684af5b8d8cd112fb83ee1365e26ec74c8"}, - {file = "tokenizers-0.13.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:cc5c022ce692e1f499d745af293ab9ee6f5d92538ed2faf73f9708c89ee59ce6"}, - {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f55c981ac44ba87c93e847c333e58c12abcbb377a0c2f2ef96e1a266e4184ff2"}, - {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f247eae99800ef821a91f47c5280e9e9afaeed9980fc444208d5aa6ba69ff148"}, - {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b3e3215d048e94f40f1c95802e45dcc37c5b05eb46280fc2ccc8cd351bff839"}, - {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ba2b0bf01777c9b9bc94b53764d6684554ce98551fec496f71bc5be3a03e98b"}, - {file = "tokenizers-0.13.3-cp311-cp311-win32.whl", hash = "sha256:cc78d77f597d1c458bf0ea7c2a64b6aa06941c7a99cb135b5969b0278824d808"}, - {file = "tokenizers-0.13.3-cp311-cp311-win_amd64.whl", hash = "sha256:ecf182bf59bd541a8876deccf0360f5ae60496fd50b58510048020751cf1724c"}, - {file = "tokenizers-0.13.3-cp37-cp37m-macosx_10_11_x86_64.whl", hash = "sha256:0527dc5436a1f6bf2c0327da3145687d3bcfbeab91fed8458920093de3901b44"}, - {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:07cbb2c307627dc99b44b22ef05ff4473aa7c7cc1fec8f0a8b37d8a64b1a16d2"}, - {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4560dbdeaae5b7ee0d4e493027e3de6d53c991b5002d7ff95083c99e11dd5ac0"}, - {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:64064bd0322405c9374305ab9b4c07152a1474370327499911937fd4a76d004b"}, - {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:b8c6e2ab0f2e3d939ca66aa1d596602105fe33b505cd2854a4c1717f704c51de"}, - {file = "tokenizers-0.13.3-cp37-cp37m-win32.whl", hash = "sha256:6cc29d410768f960db8677221e497226e545eaaea01aa3613fa0fdf2cc96cff4"}, - {file = "tokenizers-0.13.3-cp37-cp37m-win_amd64.whl", hash = "sha256:fc2a7fdf864554a0dacf09d32e17c0caa9afe72baf9dd7ddedc61973bae352d8"}, - {file = "tokenizers-0.13.3-cp38-cp38-macosx_10_11_x86_64.whl", hash = "sha256:8791dedba834c1fc55e5f1521be325ea3dafb381964be20684b92fdac95d79b7"}, - {file = "tokenizers-0.13.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:d607a6a13718aeb20507bdf2b96162ead5145bbbfa26788d6b833f98b31b26e1"}, - {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3791338f809cd1bf8e4fee6b540b36822434d0c6c6bc47162448deee3f77d425"}, - {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c2f35f30e39e6aab8716f07790f646bdc6e4a853816cc49a95ef2a9016bf9ce6"}, - {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:310204dfed5aa797128b65d63538a9837cbdd15da2a29a77d67eefa489edda26"}, - {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0f9b92ea052305166559f38498b3b0cae159caea712646648aaa272f7160963"}, - {file = "tokenizers-0.13.3-cp38-cp38-win32.whl", hash = "sha256:9a3fa134896c3c1f0da6e762d15141fbff30d094067c8f1157b9fdca593b5806"}, - {file = "tokenizers-0.13.3-cp38-cp38-win_amd64.whl", hash = "sha256:8e7b0cdeace87fa9e760e6a605e0ae8fc14b7d72e9fc19c578116f7287bb873d"}, - {file = "tokenizers-0.13.3-cp39-cp39-macosx_10_11_x86_64.whl", hash = "sha256:00cee1e0859d55507e693a48fa4aef07060c4bb6bd93d80120e18fea9371c66d"}, - {file = "tokenizers-0.13.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:a23ff602d0797cea1d0506ce69b27523b07e70f6dda982ab8cf82402de839088"}, - {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70ce07445050b537d2696022dafb115307abdffd2a5c106f029490f84501ef97"}, - {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:280ffe95f50eaaf655b3a1dc7ff1d9cf4777029dbbc3e63a74e65a056594abc3"}, - {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97acfcec592f7e9de8cadcdcda50a7134423ac8455c0166b28c9ff04d227b371"}, - {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd7730c98a3010cd4f523465867ff95cd9d6430db46676ce79358f65ae39797b"}, - {file = "tokenizers-0.13.3-cp39-cp39-win32.whl", hash = "sha256:48625a108029cb1ddf42e17a81b5a3230ba6888a70c9dc14e81bc319e812652d"}, - {file = "tokenizers-0.13.3-cp39-cp39-win_amd64.whl", hash = "sha256:bc0a6f1ba036e482db6453571c9e3e60ecd5489980ffd95d11dc9f960483d783"}, - {file = "tokenizers-0.13.3.tar.gz", hash = "sha256:2e546dbb68b623008a5442353137fbb0123d311a6d7ba52f2667c8862a75af2e"}, -] - -[package.extras] -dev = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] -docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"] + {file = "tokenizers-0.15.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:52f6130c9cbf70544287575a985bf44ae1bda2da7e8c24e97716080593638012"}, + {file = "tokenizers-0.15.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:054c1cc9c6d68f7ffa4e810b3d5131e0ba511b6e4be34157aa08ee54c2f8d9ee"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = 
"sha256:a9b9b070fdad06e347563b88c278995735292ded1132f8657084989a4c84a6d5"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea621a7eef4b70e1f7a4e84dd989ae3f0eeb50fc8690254eacc08acb623e82f1"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cf7fd9a5141634fa3aa8d6b7be362e6ae1b4cda60da81388fa533e0b552c98fd"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44f2a832cd0825295f7179eaf173381dc45230f9227ec4b44378322d900447c9"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8b9ec69247a23747669ec4b0ca10f8e3dfb3545d550258129bd62291aabe8605"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40b6a4c78da863ff26dbd5ad9a8ecc33d8a8d97b535172601cf00aee9d7ce9ce"}, + {file = "tokenizers-0.15.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5ab2a4d21dcf76af60e05af8063138849eb1d6553a0d059f6534357bce8ba364"}, + {file = "tokenizers-0.15.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a47acfac7e511f6bbfcf2d3fb8c26979c780a91e06fb5b9a43831b2c0153d024"}, + {file = "tokenizers-0.15.2-cp310-none-win32.whl", hash = "sha256:064ff87bb6acdbd693666de9a4b692add41308a2c0ec0770d6385737117215f2"}, + {file = "tokenizers-0.15.2-cp310-none-win_amd64.whl", hash = "sha256:3b919afe4df7eb6ac7cafd2bd14fb507d3f408db7a68c43117f579c984a73843"}, + {file = "tokenizers-0.15.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:89cd1cb93e4b12ff39bb2d626ad77e35209de9309a71e4d3d4672667b4b256e7"}, + {file = "tokenizers-0.15.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cfed5c64e5be23d7ee0f0e98081a25c2a46b0b77ce99a4f0605b1ec43dd481fa"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a907d76dcfda37023ba203ab4ceeb21bc5683436ebefbd895a0841fd52f6f6f2"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20ea60479de6fc7b8ae756b4b097572372d7e4032e2521c1bbf3d90c90a99ff0"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:48e2b9335be2bc0171df9281385c2ed06a15f5cf121c44094338306ab7b33f2c"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:112a1dd436d2cc06e6ffdc0b06d55ac019a35a63afd26475205cb4b1bf0bfbff"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4620cca5c2817177ee8706f860364cc3a8845bc1e291aaf661fb899e5d1c45b0"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccd73a82751c523b3fc31ff8194702e4af4db21dc20e55b30ecc2079c5d43cb7"}, + {file = "tokenizers-0.15.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:107089f135b4ae7817affe6264f8c7a5c5b4fd9a90f9439ed495f54fcea56fb4"}, + {file = "tokenizers-0.15.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0ff110ecc57b7aa4a594396525a3451ad70988e517237fe91c540997c4e50e29"}, + {file = "tokenizers-0.15.2-cp311-none-win32.whl", hash = "sha256:6d76f00f5c32da36c61f41c58346a4fa7f0a61be02f4301fd30ad59834977cc3"}, + {file = "tokenizers-0.15.2-cp311-none-win_amd64.whl", hash = "sha256:cc90102ed17271cf0a1262babe5939e0134b3890345d11a19c3145184b706055"}, + {file = "tokenizers-0.15.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = 
"sha256:f86593c18d2e6248e72fb91c77d413a815153b8ea4e31f7cd443bdf28e467670"}, + {file = "tokenizers-0.15.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0774bccc6608eca23eb9d620196687c8b2360624619623cf4ba9dc9bd53e8b51"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d0222c5b7c9b26c0b4822a82f6a7011de0a9d3060e1da176f66274b70f846b98"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3835738be1de66624fff2f4f6f6684775da4e9c00bde053be7564cbf3545cc66"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0143e7d9dcd811855c1ce1ab9bf5d96d29bf5e528fd6c7824d0465741e8c10fd"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db35825f6d54215f6b6009a7ff3eedee0848c99a6271c870d2826fbbedf31a38"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3f5e64b0389a2be47091d8cc53c87859783b837ea1a06edd9d8e04004df55a5c"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e0480c452217edd35eca56fafe2029fb4d368b7c0475f8dfa3c5c9c400a7456"}, + {file = "tokenizers-0.15.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a33ab881c8fe70474980577e033d0bc9a27b7ab8272896e500708b212995d834"}, + {file = "tokenizers-0.15.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a308a607ca9de2c64c1b9ba79ec9a403969715a1b8ba5f998a676826f1a7039d"}, + {file = "tokenizers-0.15.2-cp312-none-win32.whl", hash = "sha256:b8fcfa81bcb9447df582c5bc96a031e6df4da2a774b8080d4f02c0c16b42be0b"}, + {file = "tokenizers-0.15.2-cp312-none-win_amd64.whl", hash = "sha256:38d7ab43c6825abfc0b661d95f39c7f8af2449364f01d331f3b51c94dcff7221"}, + {file = "tokenizers-0.15.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:38bfb0204ff3246ca4d5e726e8cc8403bfc931090151e6eede54d0e0cf162ef0"}, + {file = "tokenizers-0.15.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9c861d35e8286a53e06e9e28d030b5a05bcbf5ac9d7229e561e53c352a85b1fc"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:936bf3842db5b2048eaa53dade907b1160f318e7c90c74bfab86f1e47720bdd6"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:620beacc3373277700d0e27718aa8b25f7b383eb8001fba94ee00aeea1459d89"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2735ecbbf37e52db4ea970e539fd2d450d213517b77745114f92867f3fc246eb"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:473c83c5e2359bb81b0b6fde870b41b2764fcdd36d997485e07e72cc3a62264a"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:968fa1fb3c27398b28a4eca1cbd1e19355c4d3a6007f7398d48826bbe3a0f728"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:865c60ae6eaebdde7da66191ee9b7db52e542ed8ee9d2c653b6d190a9351b980"}, + {file = "tokenizers-0.15.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7c0d8b52664ab2d4a8d6686eb5effc68b78608a9008f086a122a7b2996befbab"}, + {file = "tokenizers-0.15.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:f33dfbdec3784093a9aebb3680d1f91336c56d86cc70ddf88708251da1fe9064"}, + {file = "tokenizers-0.15.2-cp37-cp37m-macosx_10_12_x86_64.whl", hash = 
"sha256:d44ba80988ff9424e33e0a49445072ac7029d8c0e1601ad25a0ca5f41ed0c1d6"}, + {file = "tokenizers-0.15.2-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:dce74266919b892f82b1b86025a613956ea0ea62a4843d4c4237be2c5498ed3a"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0ef06b9707baeb98b316577acb04f4852239d856b93e9ec3a299622f6084e4be"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c73e2e74bbb07910da0d37c326869f34113137b23eadad3fc00856e6b3d9930c"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4eeb12daf02a59e29f578a865f55d87cd103ce62bd8a3a5874f8fdeaa82e336b"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ba9f6895af58487ca4f54e8a664a322f16c26bbb442effd01087eba391a719e"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ccec77aa7150e38eec6878a493bf8c263ff1fa8a62404e16c6203c64c1f16a26"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3f40604f5042ff210ba82743dda2b6aa3e55aa12df4e9f2378ee01a17e2855e"}, + {file = "tokenizers-0.15.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:5645938a42d78c4885086767c70923abad047163d809c16da75d6b290cb30bbe"}, + {file = "tokenizers-0.15.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:05a77cbfebe28a61ab5c3891f9939cc24798b63fa236d84e5f29f3a85a200c00"}, + {file = "tokenizers-0.15.2-cp37-none-win32.whl", hash = "sha256:361abdc068e8afe9c5b818769a48624687fb6aaed49636ee39bec4e95e1a215b"}, + {file = "tokenizers-0.15.2-cp37-none-win_amd64.whl", hash = "sha256:7ef789f83eb0f9baeb4d09a86cd639c0a5518528f9992f38b28e819df397eb06"}, + {file = "tokenizers-0.15.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:4fe1f74a902bee74a3b25aff180fbfbf4f8b444ab37c4d496af7afd13a784ed2"}, + {file = "tokenizers-0.15.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4c4b89038a684f40a6b15d6b09f49650ac64d951ad0f2a3ea9169687bbf2a8ba"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d05a1b06f986d41aed5f2de464c003004b2df8aaf66f2b7628254bcbfb72a438"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:508711a108684111ec8af89d3a9e9e08755247eda27d0ba5e3c50e9da1600f6d"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:daa348f02d15160cb35439098ac96e3a53bacf35885072611cd9e5be7d333daa"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:494fdbe5932d3416de2a85fc2470b797e6f3226c12845cadf054dd906afd0442"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c2d60f5246f4da9373f75ff18d64c69cbf60c3bca597290cea01059c336d2470"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93268e788825f52de4c7bdcb6ebc1fcd4a5442c02e730faa9b6b08f23ead0e24"}, + {file = "tokenizers-0.15.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6fc7083ab404019fc9acafe78662c192673c1e696bd598d16dc005bd663a5cf9"}, + {file = "tokenizers-0.15.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:41e39b41e5531d6b2122a77532dbea60e171ef87a3820b5a3888daa847df4153"}, + {file = "tokenizers-0.15.2-cp38-none-win32.whl", hash = 
"sha256:06cd0487b1cbfabefb2cc52fbd6b1f8d4c37799bd6c6e1641281adaa6b2504a7"}, + {file = "tokenizers-0.15.2-cp38-none-win_amd64.whl", hash = "sha256:5179c271aa5de9c71712e31cb5a79e436ecd0d7532a408fa42a8dbfa4bc23fd9"}, + {file = "tokenizers-0.15.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:82f8652a74cc107052328b87ea8b34291c0f55b96d8fb261b3880216a9f9e48e"}, + {file = "tokenizers-0.15.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:02458bee6f5f3139f1ebbb6d042b283af712c0981f5bc50edf771d6b762d5e4f"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c9a09cd26cca2e1c349f91aa665309ddb48d71636370749414fbf67bc83c5343"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:158be8ea8554e5ed69acc1ce3fbb23a06060bd4bbb09029431ad6b9a466a7121"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1ddba9a2b0c8c81633eca0bb2e1aa5b3a15362b1277f1ae64176d0f6eba78ab1"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3ef5dd1d39797044642dbe53eb2bc56435308432e9c7907728da74c69ee2adca"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:454c203164e07a860dbeb3b1f4a733be52b0edbb4dd2e5bd75023ffa8b49403a"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cf6b7f1d4dc59af960e6ffdc4faffe6460bbfa8dce27a58bf75755ffdb2526d"}, + {file = "tokenizers-0.15.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2ef09bbc16519f6c25d0c7fc0c6a33a6f62923e263c9d7cca4e58b8c61572afb"}, + {file = "tokenizers-0.15.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c9a2ebdd2ad4ec7a68e7615086e633857c85e2f18025bd05d2a4399e6c5f7169"}, + {file = "tokenizers-0.15.2-cp39-none-win32.whl", hash = "sha256:918fbb0eab96fe08e72a8c2b5461e9cce95585d82a58688e7f01c2bd546c79d0"}, + {file = "tokenizers-0.15.2-cp39-none-win_amd64.whl", hash = "sha256:524e60da0135e106b254bd71f0659be9f89d83f006ea9093ce4d1fab498c6d0d"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6a9b648a58281c4672212fab04e60648fde574877d0139cd4b4f93fe28ca8944"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:7c7d18b733be6bbca8a55084027f7be428c947ddf871c500ee603e375013ffba"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:13ca3611de8d9ddfbc4dc39ef54ab1d2d4aaa114ac8727dfdc6a6ec4be017378"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:237d1bf3361cf2e6463e6c140628e6406766e8b27274f5fcc62c747ae3c6f094"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67a0fe1e49e60c664915e9fb6b0cb19bac082ab1f309188230e4b2920230edb3"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:4e022fe65e99230b8fd89ebdfea138c24421f91c1a4f4781a8f5016fd5cdfb4d"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d857be2df69763362ac699f8b251a8cd3fac9d21893de129bc788f8baaef2693"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:708bb3e4283177236309e698da5fcd0879ce8fd37457d7c266d16b550bcbbd18"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = 
"sha256:64c35e09e9899b72a76e762f9854e8750213f67567787d45f37ce06daf57ca78"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1257f4394be0d3b00de8c9e840ca5601d0a4a8438361ce9c2b05c7d25f6057b"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02272fe48280e0293a04245ca5d919b2c94a48b408b55e858feae9618138aeda"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:dc3ad9ebc76eabe8b1d7c04d38be884b8f9d60c0cdc09b0aa4e3bcf746de0388"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:32e16bdeffa7c4f46bf2152172ca511808b952701d13e7c18833c0b73cb5c23f"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:fb16ba563d59003028b678d2361a27f7e4ae0ab29c7a80690efa20d829c81fdb"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:2277c36d2d6cdb7876c274547921a42425b6810d38354327dd65a8009acf870c"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:1cf75d32e8d250781940d07f7eece253f2fe9ecdb1dc7ba6e3833fa17b82fcbc"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1b3b31884dc8e9b21508bb76da80ebf7308fdb947a17affce815665d5c4d028"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b10122d8d8e30afb43bb1fe21a3619f62c3e2574bff2699cf8af8b0b6c5dc4a3"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d88b96ff0fe8e91f6ef01ba50b0d71db5017fa4e3b1d99681cec89a85faf7bf7"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:37aaec5a52e959892870a7c47cef80c53797c0db9149d458460f4f31e2fb250e"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e2ea752f2b0fe96eb6e2f3adbbf4d72aaa1272079b0dfa1145507bd6a5d537e6"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:4b19a808d8799fda23504a5cd31d2f58e6f52f140380082b352f877017d6342b"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:64c86e5e068ac8b19204419ed8ca90f9d25db20578f5881e337d203b314f4104"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de19c4dc503c612847edf833c82e9f73cd79926a384af9d801dcf93f110cea4e"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ea09acd2fe3324174063d61ad620dec3bcf042b495515f27f638270a7d466e8b"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cf27fd43472e07b57cf420eee1e814549203d56de00b5af8659cb99885472f1f"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:7ca22bd897537a0080521445d91a58886c8c04084a6a19e6c78c586e0cfa92a5"}, + {file = "tokenizers-0.15.2.tar.gz", hash = "sha256:e6e9c6e019dd5484be5beafc775ae6c925f4c69a3487040ed09b45e13df2cb91"}, +] + +[package.dependencies] +huggingface_hub = ">=0.16.4,<1.0" + +[package.extras] +dev = ["tokenizers[testing]"] +docs = ["setuptools_rust", "sphinx", "sphinx_rtd_theme"] testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] [[package]] @@ -8933,6 +9264,20 @@ files = [ {file = "win_precise_time-1.4.2-cp39-cp39-win_amd64.whl", hash = 
"sha256:3f510fa92d9c39ea533c983e1d62c7bc66fdf0a3e3c3bdda48d4ebb634ff7034"}, ] +[[package]] +name = "win32-setctime" +version = "1.1.0" +description = "A small Python utility to set file creation time on Windows" +optional = true +python-versions = ">=3.5" +files = [ + {file = "win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad"}, + {file = "win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2"}, +] + +[package.extras] +dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"] + [[package]] name = "wrapt" version = "1.15.0" @@ -9238,4 +9583,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "4ca5f4a7955437d6da09be909a729172b9a663cc0649227e6088dc1c2cd27e57" +content-hash = "920869be38d3b82c2c62e77d814c43be62b71ed73d68d1c570ac70886d439b91" diff --git a/pyproject.toml b/pyproject.toml index b99c9e4051..c7cda5a994 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,9 +73,9 @@ pipdeptree = {version = ">=2.9.0,<2.10", optional = true} pyathena = {version = ">=2.9.6", optional = true} weaviate-client = {version = ">=3.22", optional = true} adlfs = {version = ">=2022.4.0", optional = true} -pyodbc = {version = "^4.0.39", optional = true} -qdrant-client = {version = "^1.6.4", optional = true, extras = ["fastembed"]} -databricks-sql-connector = {version = ">=3", optional = true} +pyodbc = {version = ">=4.0.39", optional = true} +qdrant-client = {version = ">=1.8", optional = true, extras = ["fastembed"]} +databricks-sql-connector = {version = ">=2.9.3", optional = true} clickhouse-driver = { version = ">=0.2.7", optional = true } clickhouse-connect = { version = ">=0.7.7", optional = true } deltalake = { version = ">=0.17.4", optional = true } diff --git a/tests/common/cases/destinations/null.py b/tests/common/cases/destinations/null.py index b2054cd7e8..37e87d89cf 100644 --- a/tests/common/cases/destinations/null.py +++ b/tests/common/cases/destinations/null.py @@ -14,7 +14,7 @@ def __init__(self, **kwargs: Any) -> None: spec = DestinationClientConfiguration - def capabilities(self) -> DestinationCapabilitiesContext: + def _raw_capabilities(self) -> DestinationCapabilitiesContext: return DestinationCapabilitiesContext.generic_capabilities() @property diff --git a/tests/common/cases/normalizers/__init__.py b/tests/common/cases/normalizers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/common/cases/normalizers/sql_upper.py b/tests/common/cases/normalizers/sql_upper.py new file mode 100644 index 0000000000..992940d9a2 --- /dev/null +++ b/tests/common/cases/normalizers/sql_upper.py @@ -0,0 +1,20 @@ +from typing import Any, Sequence + +from dlt.common.normalizers.naming.naming import NamingConvention as BaseNamingConvention + + +class NamingConvention(BaseNamingConvention): + PATH_SEPARATOR = "__" + + _CLEANUP_TABLE = str.maketrans(".\n\r'\"▶", "______") + + def normalize_identifier(self, identifier: str) -> str: + identifier = super().normalize_identifier(identifier) + norm_identifier = identifier.translate(self._CLEANUP_TABLE).upper() + return self.shorten_identifier(norm_identifier, identifier, self.max_length) + + def make_path(self, *identifiers: Any) -> str: + return self.PATH_SEPARATOR.join(filter(lambda x: x.strip(), identifiers)) + + def break_path(self, path: str) -> Sequence[str]: + return [ident for ident in path.split(self.PATH_SEPARATOR) if ident.strip()] diff --git 
a/tests/common/cases/normalizers/title_case.py b/tests/common/cases/normalizers/title_case.py new file mode 100644 index 0000000000..41eb96fcda --- /dev/null +++ b/tests/common/cases/normalizers/title_case.py @@ -0,0 +1,14 @@ +from dlt.common.normalizers.naming.direct import NamingConvention as DirectNamingConvention + + +class NamingConvention(DirectNamingConvention): + """Test case sensitive naming that capitalizes first and last letter and leaves the rest intact""" + + PATH_SEPARATOR = "__" + + def normalize_identifier(self, identifier: str) -> str: + # keep prefix + if identifier == "_dlt": + return "_dlt" + identifier = super().normalize_identifier(identifier) + return identifier[0].upper() + identifier[1:-1] + identifier[-1].upper() diff --git a/tests/common/configuration/test_inject.py b/tests/common/configuration/test_inject.py index f0494e9898..13d68b53e9 100644 --- a/tests/common/configuration/test_inject.py +++ b/tests/common/configuration/test_inject.py @@ -570,7 +570,19 @@ def get_cf(aux: str = dlt.config.value, last_config: AuxTest = None): def test_inject_spec_into_argument_with_spec_type() -> None: # if signature contains argument with type of SPEC, it gets injected there - from dlt.destinations.impl.dummy import _configure, DummyClientConfiguration + import dlt + from dlt.common.configuration import known_sections + from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration + + @with_config( + spec=DummyClientConfiguration, + sections=( + known_sections.DESTINATION, + "dummy", + ), + ) + def _configure(config: DummyClientConfiguration = dlt.config.value) -> DummyClientConfiguration: + return config # _configure has argument of type DummyClientConfiguration that it returns # this type holds resolved configuration diff --git a/tests/common/data_writers/test_data_writers.py b/tests/common/data_writers/test_data_writers.py index 6cc7cb55ab..9b4e61a2f7 100644 --- a/tests/common/data_writers/test_data_writers.py +++ b/tests/common/data_writers/test_data_writers.py @@ -7,11 +7,9 @@ from dlt.common.data_writers.exceptions import DataWriterNotFound, SpecLookupFailed from dlt.common.typing import AnyFun -# from dlt.destinations.postgres import capabilities -from dlt.destinations.impl.redshift import capabilities as redshift_caps from dlt.common.data_writers.escape import ( escape_redshift_identifier, - escape_bigquery_identifier, + escape_hive_identifier, escape_redshift_literal, escape_postgres_literal, escape_duckdb_literal, @@ -29,8 +27,10 @@ DataWriter, DataWriterMetrics, EMPTY_DATA_WRITER_METRICS, + ImportFileWriter, InsertValuesWriter, JsonlWriter, + create_import_spec, get_best_writer_spec, resolve_best_writer_spec, is_native_writer, @@ -51,8 +51,10 @@ class _BytesIOWriter(DataWriter): @pytest.fixture def insert_writer() -> Iterator[DataWriter]: + from dlt.destinations import redshift + with io.StringIO() as f: - yield InsertValuesWriter(f, caps=redshift_caps()) + yield InsertValuesWriter(f, caps=redshift().capabilities()) @pytest.fixture @@ -154,7 +156,7 @@ def test_identifier_escape() -> None: def test_identifier_escape_bigquery() -> None: assert ( - escape_bigquery_identifier(", NULL'); DROP TABLE\"` -\\-") + escape_hive_identifier(", NULL'); DROP TABLE\"` -\\-") == "`, NULL'); DROP TABLE\"\\` -\\\\-`" ) @@ -259,3 +261,14 @@ def test_get_best_writer() -> None: assert WRITER_SPECS[get_best_writer_spec("arrow", "insert_values")] == ArrowToInsertValuesWriter with pytest.raises(DataWriterNotFound): get_best_writer_spec("arrow", "tsv") # type: ignore + + 
+def test_import_file_writer() -> None: + spec = create_import_spec("jsonl", ["jsonl"]) + assert spec.data_item_format == "file" + assert spec.file_format == "jsonl" + writer = DataWriter.writer_class_from_spec(spec) + assert writer is ImportFileWriter + w_ = writer(None) + with pytest.raises(NotImplementedError): + w_.write_header(None) diff --git a/tests/common/normalizers/snake_no_x.py b/tests/common/normalizers/snake_no_x.py new file mode 100644 index 0000000000..af3a53cbce --- /dev/null +++ b/tests/common/normalizers/snake_no_x.py @@ -0,0 +1,10 @@ +from dlt.common.normalizers.naming.snake_case import NamingConvention as SnakeCaseNamingConvention + + +class NamingConvention(SnakeCaseNamingConvention): + def normalize_identifier(self, identifier: str) -> str: + identifier = super().normalize_identifier(identifier) + if identifier.endswith("x"): + print(identifier[:-1] + "_") + return identifier[:-1] + "_" + return identifier diff --git a/tests/common/normalizers/test_import_normalizers.py b/tests/common/normalizers/test_import_normalizers.py index df6b973943..85bb8ca1cc 100644 --- a/tests/common/normalizers/test_import_normalizers.py +++ b/tests/common/normalizers/test_import_normalizers.py @@ -4,10 +4,9 @@ from dlt.common.configuration.container import Container from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.normalizers import explicit_normalizers, import_normalizers +from dlt.common.normalizers.utils import explicit_normalizers, import_normalizers from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer -from dlt.common.normalizers.naming import snake_case -from dlt.common.normalizers.naming import direct +from dlt.common.normalizers.naming import snake_case, direct from dlt.common.normalizers.naming.exceptions import InvalidNamingModule, UnknownNamingModule from tests.common.normalizers.custom_normalizers import ( @@ -64,6 +63,17 @@ def test_import_normalizers() -> None: assert json_normalizer is CustomRelationalNormalizer +@pytest.mark.parametrize("sections", ("", "SOURCES__", "SOURCES__TEST_SCHEMA__")) +def test_config_sections(sections: str) -> None: + os.environ[f"{sections}SCHEMA__NAMING"] = "direct" + os.environ[f"{sections}SCHEMA__JSON_NORMALIZER"] = ( + '{"module": "tests.common.normalizers.custom_normalizers"}' + ) + config, _, _ = import_normalizers(explicit_normalizers(schema_name="test_schema")) + assert config["names"] == "direct" + assert config["json"] == {"module": "tests.common.normalizers.custom_normalizers"} + + def test_import_normalizers_with_caps() -> None: # gets the naming convention from capabilities destination_caps = DestinationCapabilitiesContext.generic_capabilities() @@ -83,5 +93,6 @@ def test_import_invalid_naming_module() -> None: import_normalizers(explicit_normalizers("dlt.common.tests")) assert py_ex.value.naming_module == "dlt.common.tests" with pytest.raises(InvalidNamingModule) as py_ex2: - import_normalizers(explicit_normalizers("dlt.pipeline")) + import_normalizers(explicit_normalizers("dlt.pipeline.helpers")) assert py_ex2.value.naming_module == "dlt.pipeline" + assert py_ex2.value.naming_class == "helpers" diff --git a/tests/common/normalizers/test_json_relational.py b/tests/common/normalizers/test_json_relational.py index 502ce619dd..159e33da4d 100644 --- a/tests/common/normalizers/test_json_relational.py +++ b/tests/common/normalizers/test_json_relational.py @@ -2,16 +2,15 @@ from dlt.common.typing import StrAny, DictStrAny from dlt.common.normalizers.naming import 
NamingConvention -from dlt.common.schema.typing import TSimpleRegex +from dlt.common.schema.typing import TColumnName, TSimpleRegex from dlt.common.utils import digest128, uniq_id -from dlt.common.schema import Schema, TTableSchema +from dlt.common.schema import Schema from dlt.common.schema.utils import new_table from dlt.common.normalizers.json.relational import ( RelationalNormalizerConfigPropagation, DataItemNormalizer as RelationalNormalizer, DLT_ID_LENGTH_BYTES, - TDataItemRow, ) # _flatten, _get_child_row_hash, _normalize_row, normalize_data_item, @@ -30,7 +29,7 @@ def test_flatten_fix_field_name(norm: RelationalNormalizer) -> None: "f 2": [], "f!3": {"f4": "a", "f-5": "b", "f*6": {"c": 7, "c v": 8, "c x": []}}, } - flattened_row, lists = norm._flatten("mock_table", row, 0) # type: ignore[arg-type] + flattened_row, lists = norm._flatten("mock_table", row, 0) assert "f_1" in flattened_row # assert "f_2" in flattened_row assert "f_3__f4" in flattened_row @@ -63,12 +62,12 @@ def test_preserve_complex_value(norm: RelationalNormalizer) -> None: ) ) row_1 = {"value": 1} - flattened_row, _ = norm._flatten("with_complex", row_1, 0) # type: ignore[arg-type] - assert flattened_row["value"] == 1 # type: ignore[typeddict-item] + flattened_row, _ = norm._flatten("with_complex", row_1, 0) + assert flattened_row["value"] == 1 row_2 = {"value": {"complex": True}} - flattened_row, _ = norm._flatten("with_complex", row_2, 0) # type: ignore[arg-type] - assert flattened_row["value"] == row_2["value"] # type: ignore[typeddict-item] + flattened_row, _ = norm._flatten("with_complex", row_2, 0) + assert flattened_row["value"] == row_2["value"] # complex value is not flattened assert "value__complex" not in flattened_row @@ -79,12 +78,12 @@ def test_preserve_complex_value_with_hint(norm: RelationalNormalizer) -> None: norm.schema._compile_settings() row_1 = {"value": 1} - flattened_row, _ = norm._flatten("any_table", row_1, 0) # type: ignore[arg-type] - assert flattened_row["value"] == 1 # type: ignore[typeddict-item] + flattened_row, _ = norm._flatten("any_table", row_1, 0) + assert flattened_row["value"] == 1 row_2 = {"value": {"complex": True}} - flattened_row, _ = norm._flatten("any_table", row_2, 0) # type: ignore[arg-type] - assert flattened_row["value"] == row_2["value"] # type: ignore[typeddict-item] + flattened_row, _ = norm._flatten("any_table", row_2, 0) + assert flattened_row["value"] == row_2["value"] # complex value is not flattened assert "value__complex" not in flattened_row @@ -94,7 +93,7 @@ def test_child_table_linking(norm: RelationalNormalizer) -> None: # request _dlt_root_id propagation add_dlt_root_id_propagation(norm) - rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] + rows = list(norm._normalize_row(row, {}, ("table",))) # should have 7 entries (root + level 1 + 3 * list + 2 * object) assert len(rows) == 7 # root elem will not have a root hash if not explicitly added, "extend" is added only to child @@ -142,7 +141,7 @@ def test_child_table_linking_primary_key(norm: RelationalNormalizer) -> None: norm.schema.merge_hints({"primary_key": [TSimpleRegex("id")]}) norm.schema._compile_settings() - rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] + rows = list(norm._normalize_row(row, {}, ("table",))) root = next(t for t in rows if t[0][0] == "table")[1] # record hash is random for primary keys, not based on their content # this is a change introduced in dlt 0.2.0a30 @@ -172,7 +171,7 @@ def test_yields_parents_first(norm: 
RelationalNormalizer) -> None: "f": [{"id": "level1", "l": ["a", "b", "c"], "v": 120, "o": [{"a": 1}, {"a": 2}]}], "g": [{"id": "level2_g", "l": ["a"]}], } - rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] + rows = list(norm._normalize_row(row, {}, ("table",))) tables = list(r[0][0] for r in rows) # child tables are always yielded before parent tables expected_tables = [ @@ -218,7 +217,7 @@ def test_yields_parent_relation(norm: RelationalNormalizer) -> None: } ], } - rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] + rows = list(norm._normalize_row(row, {}, ("table",))) # normalizer must return parent table first and move in order of the list elements when yielding child tables # the yielding order if fully defined expected_parents = [ @@ -276,10 +275,10 @@ def test_yields_parent_relation(norm: RelationalNormalizer) -> None: def test_list_position(norm: RelationalNormalizer) -> None: - row: StrAny = { + row: DictStrAny = { "f": [{"l": ["a", "b", "c"], "v": 120, "lo": [{"e": "a"}, {"e": "b"}, {"e": "c"}]}] } - rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] + rows = list(norm._normalize_row(row, {}, ("table",))) # root has no pos root = [t for t in rows if t[0][0] == "table"][0][1] assert "_dlt_list_idx" not in root @@ -290,13 +289,13 @@ def test_list_position(norm: RelationalNormalizer) -> None: # f_l must be ordered as it appears in the list for pos, elem in enumerate(["a", "b", "c"]): - row = next(t[1] for t in rows if t[0][0] == "table__f__l" and t[1]["value"] == elem) - assert row["_dlt_list_idx"] == pos + row_1 = next(t[1] for t in rows if t[0][0] == "table__f__l" and t[1]["value"] == elem) + assert row_1["_dlt_list_idx"] == pos # f_lo must be ordered - list of objects for pos, elem in enumerate(["a", "b", "c"]): - row = next(t[1] for t in rows if t[0][0] == "table__f__lo" and t[1]["e"] == elem) - assert row["_dlt_list_idx"] == pos + row_2 = next(t[1] for t in rows if t[0][0] == "table__f__lo" and t[1]["e"] == elem) + assert row_2["_dlt_list_idx"] == pos # def test_list_of_lists(norm: RelationalNormalizer) -> None: @@ -430,7 +429,7 @@ def test_child_row_deterministic_hash(norm: RelationalNormalizer) -> None: "_dlt_id": row_id, "f": [{"l": ["a", "b", "c"], "v": 120, "lo": [{"e": "a"}, {"e": "b"}, {"e": "c"}]}], } - rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] + rows = list(norm._normalize_row(row, {}, ("table",))) children = [t for t in rows if t[0][0] != "table"] # all hashes must be different distinct_hashes = set([ch[1]["_dlt_id"] for ch in children]) @@ -449,19 +448,19 @@ def test_child_row_deterministic_hash(norm: RelationalNormalizer) -> None: assert f_lo_p2["_dlt_id"] == digest128(f"{el_f['_dlt_id']}_table__f__lo_2", DLT_ID_LENGTH_BYTES) # same data with same table and row_id - rows_2 = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] + rows_2 = list(norm._normalize_row(row, {}, ("table",))) children_2 = [t for t in rows_2 if t[0][0] != "table"] # corresponding hashes must be identical assert all(ch[0][1]["_dlt_id"] == ch[1][1]["_dlt_id"] for ch in zip(children, children_2)) # change parent table and all child hashes must be different - rows_4 = list(norm._normalize_row(row, {}, ("other_table",))) # type: ignore[arg-type] + rows_4 = list(norm._normalize_row(row, {}, ("other_table",))) children_4 = [t for t in rows_4 if t[0][0] != "other_table"] assert all(ch[0][1]["_dlt_id"] != ch[1][1]["_dlt_id"] for ch in zip(children, 
children_4)) # change parent hash and all child hashes must be different row["_dlt_id"] = uniq_id() - rows_3 = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] + rows_3 = list(norm._normalize_row(row, {}, ("table",))) children_3 = [t for t in rows_3 if t[0][0] != "table"] assert all(ch[0][1]["_dlt_id"] != ch[1][1]["_dlt_id"] for ch in zip(children, children_3)) @@ -469,14 +468,16 @@ def test_child_row_deterministic_hash(norm: RelationalNormalizer) -> None: def test_keeps_dlt_id(norm: RelationalNormalizer) -> None: h = uniq_id() row = {"a": "b", "_dlt_id": h} - rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] + rows = list(norm._normalize_row(row, {}, ("table",))) root = [t for t in rows if t[0][0] == "table"][0][1] assert root["_dlt_id"] == h def test_propagate_hardcoded_context(norm: RelationalNormalizer) -> None: row = {"level": 1, "list": ["a", "b", "c"], "comp": [{"_timestamp": "a"}]} - rows = list(norm._normalize_row(row, {"_timestamp": 1238.9, "_dist_key": "SENDER_3000"}, ("table",))) # type: ignore[arg-type] + rows = list( + norm._normalize_row(row, {"_timestamp": 1238.9, "_dist_key": "SENDER_3000"}, ("table",)) + ) # context is not added to root element root = next(t for t in rows if t[0][0] == "table")[1] assert "_timestamp" in root @@ -506,7 +507,7 @@ def test_propagates_root_context(norm: RelationalNormalizer) -> None: "dependent_list": [1, 2, 3], "dependent_objects": [{"vx": "ax"}], } - normalized_rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] + normalized_rows = list(norm._normalize_row(row, {}, ("table",))) # all non-root rows must have: non_root = [r for r in normalized_rows if r[0][1] is not None] assert all(r[1]["_dlt_root_id"] == "###" for r in non_root) @@ -522,12 +523,12 @@ def test_propagates_table_context( prop_config: RelationalNormalizerConfigPropagation = norm.schema._normalizers_config["json"][ "config" ]["propagation"] - prop_config["root"]["timestamp"] = "_partition_ts" # type: ignore[index] + prop_config["root"][TColumnName("timestamp")] = TColumnName("_partition_ts") # for table "table__lvl1" request to propagate "vx" and "partition_ovr" as "_partition_ts" (should overwrite root) - prop_config["tables"]["table__lvl1"] = { # type: ignore[index] - "vx": "__vx", - "partition_ovr": "_partition_ts", - "__not_found": "__not_found", + prop_config["tables"]["table__lvl1"] = { + TColumnName("vx"): TColumnName("__vx"), + TColumnName("partition_ovr"): TColumnName("_partition_ts"), + TColumnName("__not_found"): TColumnName("__not_found"), } if add_pk: @@ -545,7 +546,7 @@ def test_propagates_table_context( # to reproduce a bug where rows with _dlt_id set were not extended row["lvl1"][0]["_dlt_id"] = "row_id_lvl1" # type: ignore[index] - normalized_rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] + normalized_rows = list(norm._normalize_row(row, {}, ("table",))) non_root = [r for r in normalized_rows if r[0][1] is not None] # _dlt_root_id in all non root assert all(r[1]["_dlt_root_id"] == "###" for r in non_root) @@ -574,10 +575,10 @@ def test_propagates_table_context_to_lists(norm: RelationalNormalizer) -> None: prop_config: RelationalNormalizerConfigPropagation = norm.schema._normalizers_config["json"][ "config" ]["propagation"] - prop_config["root"]["timestamp"] = "_partition_ts" # type: ignore[index] + prop_config["root"][TColumnName("timestamp")] = TColumnName("_partition_ts") row = {"_dlt_id": "###", "timestamp": 12918291.1212, "lvl1": [1, 2, 3, [4, 5, 
6]]} - normalized_rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] + normalized_rows = list(norm._normalize_row(row, {}, ("table",))) # _partition_ts == timestamp on all child tables non_root = [r for r in normalized_rows if r[0][1] is not None] assert all(r[1]["_partition_ts"] == 12918291.1212 for r in non_root) @@ -590,7 +591,7 @@ def test_removes_normalized_list(norm: RelationalNormalizer) -> None: # after normalizing the list that got normalized into child table must be deleted row = {"comp": [{"_timestamp": "a"}]} # get iterator - normalized_rows_i = norm._normalize_row(row, {}, ("table",)) # type: ignore[arg-type] + normalized_rows_i = norm._normalize_row(row, {}, ("table",)) # yield just one item root_row = next(normalized_rows_i) # root_row = next(r for r in normalized_rows if r[0][1] is None) @@ -614,7 +615,7 @@ def test_preserves_complex_types_list(norm: RelationalNormalizer) -> None: ) ) row = {"value": ["from", {"complex": True}]} - normalized_rows = list(norm._normalize_row(row, {}, ("event_slot",))) # type: ignore[arg-type] + normalized_rows = list(norm._normalize_row(row, {}, ("event_slot",))) # make sure only 1 row is emitted, the list is not normalized assert len(normalized_rows) == 1 # value is kept in root row -> market as complex @@ -623,7 +624,7 @@ def test_preserves_complex_types_list(norm: RelationalNormalizer) -> None: # same should work for a list row = {"value": ["from", ["complex", True]]} # type: ignore[list-item] - normalized_rows = list(norm._normalize_row(row, {}, ("event_slot",))) # type: ignore[arg-type] + normalized_rows = list(norm._normalize_row(row, {}, ("event_slot",))) # make sure only 1 row is emitted, the list is not normalized assert len(normalized_rows) == 1 # value is kept in root row -> market as complex @@ -735,7 +736,7 @@ def test_table_name_meta_normalized() -> None: def test_parse_with_primary_key() -> None: schema = create_schema_with_name("discord") - schema.merge_hints({"primary_key": ["id"]}) # type: ignore[list-item] + schema._merge_hints({"primary_key": ["id"]}) # type: ignore[list-item] schema._compile_settings() add_dlt_root_id_propagation(schema.data_item_normalizer) # type: ignore[arg-type] diff --git a/tests/common/normalizers/test_naming.py b/tests/common/normalizers/test_naming.py index 3bf4762c35..27325ab3cc 100644 --- a/tests/common/normalizers/test_naming.py +++ b/tests/common/normalizers/test_naming.py @@ -266,8 +266,9 @@ def test_shorten_fragments(convention: Type[NamingConvention]) -> None: assert naming.shorten_fragments(*RAW_PATH_WITH_EMPTY_IDENT) == norm_path -# 'event__parse_data__response_selector__default__response__response_templates' -# E 'event__parse_data__response_selector__default__response__responses' +def test_naming_convention_name() -> None: + assert SnakeCaseNamingConvention.name() == "snake_case" + assert DirectNamingConvention.name() == "direct" def assert_short_path(norm_path: str, naming: NamingConvention) -> None: diff --git a/tests/common/schema/conftest.py b/tests/common/schema/conftest.py new file mode 100644 index 0000000000..53d02fc663 --- /dev/null +++ b/tests/common/schema/conftest.py @@ -0,0 +1,25 @@ +import pytest + +from dlt.common.configuration import resolve_configuration +from dlt.common.schema import Schema +from dlt.common.storages import SchemaStorageConfiguration, SchemaStorage + + +from tests.utils import autouse_test_storage, preserve_environ + + +@pytest.fixture +def schema() -> Schema: + return Schema("event") + + +@pytest.fixture +def 
schema_storage() -> SchemaStorage: + C = resolve_configuration( + SchemaStorageConfiguration(), + explicit_value={ + "import_schema_path": "tests/common/cases/schemas/rasa", + "external_schema_format": "json", + }, + ) + return SchemaStorage(C, makedirs=True) diff --git a/tests/common/schema/test_filtering.py b/tests/common/schema/test_filtering.py index 8cfac9309f..6634a38aa6 100644 --- a/tests/common/schema/test_filtering.py +++ b/tests/common/schema/test_filtering.py @@ -10,11 +10,6 @@ from tests.common.utils import load_json_case -@pytest.fixture -def schema() -> Schema: - return Schema("event") - - def test_row_field_filter(schema: Schema) -> None: _add_excludes(schema) bot_case: DictStrAny = load_json_case("mod_bot_case") diff --git a/tests/common/schema/test_inference.py b/tests/common/schema/test_inference.py index 0a40953f53..e2821d5626 100644 --- a/tests/common/schema/test_inference.py +++ b/tests/common/schema/test_inference.py @@ -1,3 +1,4 @@ +import os import pytest from copy import deepcopy from typing import Any, List @@ -16,11 +17,6 @@ from tests.common.utils import load_json_case -@pytest.fixture -def schema() -> Schema: - return Schema("event") - - def test_get_preferred_type(schema: Schema) -> None: _add_preferred_types(schema) @@ -204,11 +200,10 @@ def test_shorten_variant_column(schema: Schema) -> None: } _, new_table = schema.coerce_row("event_user", None, row_1) # schema assumes that identifiers are already normalized so confidence even if it is longer than 9 chars - schema.update_table(new_table) + schema.update_table(new_table, normalize_identifiers=False) assert "confidence" in schema.tables["event_user"]["columns"] # confidence_123456 # now variant is created and this will be normalized - # TODO: we should move the handling of variants to normalizer new_row_2, new_table = schema.coerce_row("event_user", None, {"confidence": False}) tag = schema.naming._compute_tag( "confidence__v_bool", collision_prob=schema.naming._DEFAULT_COLLISION_PROB @@ -219,6 +214,9 @@ def test_shorten_variant_column(schema: Schema) -> None: def test_coerce_complex_variant(schema: Schema) -> None: + # for this test use case sensitive naming convention + os.environ["SCHEMA__NAMING"] = "direct" + schema.update_normalizers() # create two columns to which complex type cannot be coerced row = {"floatX": 78172.128, "confidenceX": 1.2, "strX": "STR"} new_row, new_table = schema.coerce_row("event_user", None, row) @@ -252,12 +250,12 @@ def test_coerce_complex_variant(schema: Schema) -> None: c_new_columns_v = list(c_new_table_v["columns"].values()) # two new variant columns added assert len(c_new_columns_v) == 2 - assert c_new_columns_v[0]["name"] == "floatX__v_complex" - assert c_new_columns_v[1]["name"] == "confidenceX__v_complex" + assert c_new_columns_v[0]["name"] == "floatX▶v_complex" + assert c_new_columns_v[1]["name"] == "confidenceX▶v_complex" assert c_new_columns_v[0]["variant"] is True assert c_new_columns_v[1]["variant"] is True - assert c_new_row_v["floatX__v_complex"] == v_list - assert c_new_row_v["confidenceX__v_complex"] == v_dict + assert c_new_row_v["floatX▶v_complex"] == v_list + assert c_new_row_v["confidenceX▶v_complex"] == v_dict assert c_new_row_v["strX"] == json.dumps(v_dict) schema.update_table(c_new_table_v) @@ -265,8 +263,8 @@ def test_coerce_complex_variant(schema: Schema) -> None: c_row_v = {"floatX": v_list, "confidenceX": v_dict, "strX": v_dict} c_new_row_v, c_new_table_v = schema.coerce_row("event_user", None, c_row_v) assert c_new_table_v is None - assert 
c_new_row_v["floatX__v_complex"] == v_list - assert c_new_row_v["confidenceX__v_complex"] == v_dict + assert c_new_row_v["floatX▶v_complex"] == v_list + assert c_new_row_v["confidenceX▶v_complex"] == v_dict assert c_new_row_v["strX"] == json.dumps(v_dict) @@ -539,7 +537,7 @@ def test_infer_on_incomplete_column(schema: Schema) -> None: incomplete_col["primary_key"] = True incomplete_col["x-special"] = "spec" # type: ignore[typeddict-unknown-key] table = utils.new_table("table", columns=[incomplete_col]) - schema.update_table(table) + schema.update_table(table, normalize_identifiers=False) # make sure that column is still incomplete and has no default hints assert schema.get_table("table")["columns"]["I"] == { "name": "I", diff --git a/tests/common/schema/test_merges.py b/tests/common/schema/test_merges.py index 8516414abd..893fd1db5f 100644 --- a/tests/common/schema/test_merges.py +++ b/tests/common/schema/test_merges.py @@ -2,10 +2,9 @@ import pytest from copy import copy, deepcopy -from dlt.common.schema import Schema, utils +from dlt.common.schema import utils from dlt.common.schema.exceptions import ( CannotCoerceColumnException, - CannotCoerceNullException, TablePropertiesConflictException, ) from dlt.common.schema.typing import TColumnSchemaBase, TStoredSchema, TTableSchema, TColumnSchema @@ -294,10 +293,10 @@ def test_diff_tables() -> None: empty = utils.new_table("table") del empty["resource"] print(empty) - partial = utils.diff_table(empty, deepcopy(table)) + partial = utils.diff_table("schema", empty, deepcopy(table)) # partial is simply table assert partial == table - partial = utils.diff_table(deepcopy(table), empty) + partial = utils.diff_table("schema", deepcopy(table), empty) # partial is empty assert partial == empty @@ -305,7 +304,7 @@ def test_diff_tables() -> None: changed = deepcopy(table) changed["description"] = "new description" changed["name"] = "new name" - partial = utils.diff_table(deepcopy(table), changed) + partial = utils.diff_table("schema", deepcopy(table), changed) print(partial) assert partial == {"name": "new name", "description": "new description", "columns": {}} @@ -313,7 +312,7 @@ def test_diff_tables() -> None: existing = deepcopy(table) changed["write_disposition"] = "append" changed["schema_contract"] = "freeze" - partial = utils.diff_table(deepcopy(existing), changed) + partial = utils.diff_table("schema", deepcopy(existing), changed) assert partial == { "name": "new name", "description": "new description", @@ -323,14 +322,14 @@ def test_diff_tables() -> None: } existing["write_disposition"] = "append" existing["schema_contract"] = "freeze" - partial = utils.diff_table(deepcopy(existing), changed) + partial = utils.diff_table("schema", deepcopy(existing), changed) assert partial == {"name": "new name", "description": "new description", "columns": {}} # detect changed column existing = deepcopy(table) changed = deepcopy(table) changed["columns"]["test"]["cluster"] = True - partial = utils.diff_table(existing, changed) + partial = utils.diff_table("schema", existing, changed) assert "test" in partial["columns"] assert "test_2" not in partial["columns"] assert existing["columns"]["test"] == table["columns"]["test"] != partial["columns"]["test"] @@ -339,7 +338,7 @@ def test_diff_tables() -> None: existing = deepcopy(table) changed = deepcopy(table) changed["columns"]["test"]["foreign_key"] = False - partial = utils.diff_table(existing, changed) + partial = utils.diff_table("schema", existing, changed) assert "test" in partial["columns"] # even if not 
present in tab_a at all @@ -347,7 +346,7 @@ def test_diff_tables() -> None: changed = deepcopy(table) changed["columns"]["test"]["foreign_key"] = False del existing["columns"]["test"]["foreign_key"] - partial = utils.diff_table(existing, changed) + partial = utils.diff_table("schema", existing, changed) assert "test" in partial["columns"] @@ -363,7 +362,7 @@ def test_diff_tables_conflicts() -> None: other = utils.new_table("table_2") with pytest.raises(TablePropertiesConflictException) as cf_ex: - utils.diff_table(table, other) + utils.diff_table("schema", table, other) assert cf_ex.value.table_name == "table" assert cf_ex.value.prop_name == "parent" @@ -371,7 +370,7 @@ def test_diff_tables_conflicts() -> None: changed = deepcopy(table) changed["columns"]["test"]["data_type"] = "bigint" with pytest.raises(CannotCoerceColumnException): - utils.diff_table(table, changed) + utils.diff_table("schema", table, changed) def test_merge_tables() -> None: @@ -391,7 +390,7 @@ def test_merge_tables() -> None: changed["new-prop-3"] = False # type: ignore[typeddict-unknown-key] # drop column so partial has it del table["columns"]["test"] - partial = utils.merge_table(table, changed) + partial = utils.merge_table("schema", table, changed) assert "test" in table["columns"] assert table["x-special"] == 129 # type: ignore[typeddict-item] assert table["description"] == "new description" @@ -420,7 +419,7 @@ def test_merge_tables_incomplete_columns() -> None: changed["columns"] = deepcopy({"test": COL_1_HINTS, "test_2": COL_2_HINTS}) # it is completed now changed["columns"]["test_2"]["data_type"] = "bigint" - partial = utils.merge_table(table, changed) + partial = utils.merge_table("schema", table, changed) assert list(partial["columns"].keys()) == ["test_2"] # test_2 goes to the end, it was incomplete in table so it got dropped before update assert list(table["columns"].keys()) == ["test", "test_2"] @@ -435,7 +434,7 @@ def test_merge_tables_incomplete_columns() -> None: changed["columns"] = deepcopy({"test": COL_1_HINTS, "test_2": COL_2_HINTS}) # still incomplete but changed changed["columns"]["test_2"]["nullable"] = False - partial = utils.merge_table(table, changed) + partial = utils.merge_table("schema", table, changed) assert list(partial["columns"].keys()) == ["test_2"] # incomplete -> incomplete stays in place assert list(table["columns"].keys()) == ["test_2", "test"] diff --git a/tests/common/schema/test_normalize_identifiers.py b/tests/common/schema/test_normalize_identifiers.py new file mode 100644 index 0000000000..b71977a5fd --- /dev/null +++ b/tests/common/schema/test_normalize_identifiers.py @@ -0,0 +1,412 @@ +from copy import deepcopy +import os +from typing import Callable +import pytest + +from dlt.common import json +from dlt.common.configuration import resolve_configuration +from dlt.common.configuration.container import Container +from dlt.common.normalizers.naming.naming import NamingConvention +from dlt.common.storages import SchemaStorageConfiguration +from dlt.common.destination.capabilities import DestinationCapabilitiesContext +from dlt.common.normalizers.naming import snake_case, direct +from dlt.common.schema import TColumnSchema, Schema, TStoredSchema, utils +from dlt.common.schema.exceptions import TableIdentifiersFrozen +from dlt.common.schema.typing import SIMPLE_REGEX_PREFIX +from dlt.common.storages import SchemaStorage + +from tests.common.cases.normalizers import sql_upper +from tests.common.utils import load_json_case, load_yml_case + + +@pytest.fixture +def 
schema_storage_no_import() -> SchemaStorage: + C = resolve_configuration(SchemaStorageConfiguration()) + return SchemaStorage(C, makedirs=True) + + +@pytest.fixture +def cn_schema() -> Schema: + return Schema( + "column_default", + { + "names": "tests.common.normalizers.custom_normalizers", + "json": { + "module": "tests.common.normalizers.custom_normalizers", + "config": {"not_null": ["fake_id"]}, + }, + }, + ) + + +def test_save_store_schema_custom_normalizers( + cn_schema: Schema, schema_storage: SchemaStorage +) -> None: + schema_storage.save_schema(cn_schema) + schema_copy = schema_storage.load_schema(cn_schema.name) + assert_new_schema_values_custom_normalizers(schema_copy) + + +def test_new_schema_custom_normalizers(cn_schema: Schema) -> None: + assert_new_schema_values_custom_normalizers(cn_schema) + + +def test_save_load_incomplete_column( + schema: Schema, schema_storage_no_import: SchemaStorage +) -> None: + # make sure that incomplete column is saved and restored without default hints + incomplete_col = utils.new_column("I", nullable=False) + incomplete_col["primary_key"] = True + incomplete_col["x-special"] = "spec" # type: ignore[typeddict-unknown-key] + table = utils.new_table("table", columns=[incomplete_col]) + schema.update_table(table, normalize_identifiers=False) + schema_storage_no_import.save_schema(schema) + schema_copy = schema_storage_no_import.load_schema("event") + assert schema_copy.get_table("table")["columns"]["I"] == { + "name": "I", + "nullable": False, + "primary_key": True, + "x-special": "spec", + } + + +def test_schema_config_normalizers(schema: Schema, schema_storage_no_import: SchemaStorage) -> None: + # save snake case schema + assert schema._normalizers_config["names"] == "snake_case" + schema_storage_no_import.save_schema(schema) + # config direct naming convention + os.environ["SCHEMA__NAMING"] = "direct" + # new schema has direct naming convention + schema_direct_nc = Schema("direct_naming") + schema_storage_no_import.save_schema(schema_direct_nc) + assert schema_direct_nc._normalizers_config["names"] == "direct" + # still after loading the config is "snake" + schema = schema_storage_no_import.load_schema(schema.name) + assert schema._normalizers_config["names"] == "snake_case" + # provide capabilities context + destination_caps = DestinationCapabilitiesContext.generic_capabilities() + destination_caps.naming_convention = "sql_cs_v1" + destination_caps.max_identifier_length = 127 + with Container().injectable_context(destination_caps): + # caps are ignored if schema is configured + schema_direct_nc = Schema("direct_naming") + assert schema_direct_nc._normalizers_config["names"] == "direct" + # but length is there + assert schema_direct_nc.naming.max_length == 127 + # when loading schema configuration is ignored + schema = schema_storage_no_import.load_schema(schema.name) + assert schema._normalizers_config["names"] == "snake_case" + assert schema.naming.max_length == 127 + # but if we ask to update normalizers config schema is applied + schema.update_normalizers() + assert schema._normalizers_config["names"] == "direct" + + # load schema_direct_nc (direct) + schema_direct_nc = schema_storage_no_import.load_schema(schema_direct_nc.name) + assert schema_direct_nc._normalizers_config["names"] == "direct" + + # drop config + del os.environ["SCHEMA__NAMING"] + schema_direct_nc = schema_storage_no_import.load_schema(schema_direct_nc.name) + assert schema_direct_nc._normalizers_config["names"] == "direct" + + +def test_schema_normalizers_no_config( + 
schema: Schema, schema_storage_no_import: SchemaStorage +) -> None: + # convert schema to direct and save + os.environ["SCHEMA__NAMING"] = "direct" + schema.update_normalizers() + assert schema._normalizers_config["names"] == "direct" + schema_storage_no_import.save_schema(schema) + # make sure we drop the config correctly + del os.environ["SCHEMA__NAMING"] + schema_test = Schema("test") + assert schema_test.naming.name() == "snake_case" + # use capabilities without default naming convention + destination_caps = DestinationCapabilitiesContext.generic_capabilities() + assert destination_caps.naming_convention is None + destination_caps.max_identifier_length = 66 + with Container().injectable_context(destination_caps): + schema_in_caps = Schema("schema_in_caps") + assert schema_in_caps._normalizers_config["names"] == "snake_case" + assert schema_in_caps.naming.name() == "snake_case" + assert schema_in_caps.naming.max_length == 66 + schema_in_caps.update_normalizers() + assert schema_in_caps.naming.name() == "snake_case" + # old schema preserves convention when loaded + schema = schema_storage_no_import.load_schema(schema.name) + assert schema._normalizers_config["names"] == "direct" + # update normalizer no effect + schema.update_normalizers() + assert schema._normalizers_config["names"] == "direct" + assert schema.naming.max_length == 66 + + # use caps with default naming convention + destination_caps = DestinationCapabilitiesContext.generic_capabilities() + destination_caps.naming_convention = "sql_cs_v1" + destination_caps.max_identifier_length = 127 + with Container().injectable_context(destination_caps): + schema_in_caps = Schema("schema_in_caps") + # new schema gets convention from caps + assert schema_in_caps._normalizers_config["names"] == "sql_cs_v1" + # old schema preserves convention when loaded + schema = schema_storage_no_import.load_schema(schema.name) + assert schema._normalizers_config["names"] == "direct" + # update changes to caps schema + schema.update_normalizers() + assert schema._normalizers_config["names"] == "sql_cs_v1" + assert schema.naming.max_length == 127 + + +@pytest.mark.parametrize("section", ("SOURCES__SCHEMA__NAMING", "SOURCES__THIS__SCHEMA__NAMING")) +def test_config_with_section(section: str) -> None: + os.environ["SOURCES__OTHER__SCHEMA__NAMING"] = "direct" + os.environ[section] = "sql_cs_v1" + this_schema = Schema("this") + that_schema = Schema("that") + assert this_schema.naming.name() == "sql_cs_v1" + expected_that_schema = ( + "snake_case" if section == "SOURCES__THIS__SCHEMA__NAMING" else "sql_cs_v1" + ) + assert that_schema.naming.name() == expected_that_schema + + # test update normalizers + os.environ[section] = "direct" + expected_that_schema = "snake_case" if section == "SOURCES__THIS__SCHEMA__NAMING" else "direct" + this_schema.update_normalizers() + assert this_schema.naming.name() == "direct" + that_schema.update_normalizers() + assert that_schema.naming.name() == expected_that_schema + + +def test_normalize_table_identifiers() -> None: + # load with snake case + schema_dict: TStoredSchema = load_json_case("schemas/github/issues.schema") + schema = Schema.from_dict(schema_dict) # type: ignore[arg-type] + issues_table = schema.tables["issues"] + issues_table_str = json.dumps(issues_table) + # normalize table to upper + issues_table_norm = utils.normalize_table_identifiers( + issues_table, sql_upper.NamingConvention() + ) + # nothing got changes in issues table + assert issues_table_str == json.dumps(issues_table) + # check normalization + 
assert issues_table_norm["name"] == "ISSUES" + assert "REACTIONS___1" in issues_table_norm["columns"] + # subsequent normalization does not change dict + assert issues_table_norm == utils.normalize_table_identifiers( + issues_table_norm, sql_upper.NamingConvention() + ) + + +def test_normalize_table_identifiers_idempotent() -> None: + schema_dict: TStoredSchema = load_json_case("schemas/github/issues.schema") + schema = Schema.from_dict(schema_dict) # type: ignore[arg-type] + # assert column generated from "reactions/+1" and "-1", it is a valid identifier even with three underscores + assert "reactions___1" in schema.tables["issues"]["columns"] + issues_table = schema.tables["issues"] + # this schema is already normalized so normalization is idempotent + assert schema.tables["issues"] == utils.normalize_table_identifiers(issues_table, schema.naming) + assert schema.tables["issues"] == utils.normalize_table_identifiers( + utils.normalize_table_identifiers(issues_table, schema.naming), schema.naming + ) + + +def test_normalize_table_identifiers_merge_columns() -> None: + # create conflicting columns + table_create = [ + {"name": "case", "data_type": "bigint", "nullable": False, "x-description": "desc"}, + {"name": "Case", "data_type": "double", "nullable": True, "primary_key": True}, + ] + # schema normalizing to snake case will conflict on case and Case + table = utils.new_table("blend", columns=table_create) # type: ignore[arg-type] + table_str = json.dumps(table) + norm_table = utils.normalize_table_identifiers(table, Schema("norm").naming) + # nothing got changed in original table + assert table_str == json.dumps(table) + # only one column + assert len(norm_table["columns"]) == 1 + assert norm_table["columns"]["case"] == { + "nullable": False, # remove default, preserve non default + "primary_key": True, + "name": "case", + "data_type": "double", + "x-description": "desc", + } + + +def test_update_normalizers() -> None: + schema_dict: TStoredSchema = load_json_case("schemas/github/issues.schema") + schema = Schema.from_dict(schema_dict) # type: ignore[arg-type] + # drop seen data + del schema.tables["issues"]["x-normalizer"] + del schema.tables["issues__labels"]["x-normalizer"] + del schema.tables["issues__assignees"]["x-normalizer"] + # save default hints in original form + default_hints = schema._settings["default_hints"] + + os.environ["SCHEMA__NAMING"] = "tests.common.cases.normalizers.sql_upper" + schema.update_normalizers() + assert isinstance(schema.naming, sql_upper.NamingConvention) + # print(schema.to_pretty_yaml()) + assert_schema_identifiers_case(schema, str.upper) + + # resource must be old name + assert schema.tables["ISSUES"]["resource"] == "issues" + + # make sure normalizer config is replaced + assert schema._normalizers_config["names"] == "tests.common.cases.normalizers.sql_upper" + assert "allow_identifier_change_on_table_with_data" not in schema._normalizers_config + + # regexes are uppercased + new_default_hints = schema._settings["default_hints"] + for hint, regexes in default_hints.items(): + # same number of hints + assert len(regexes) == len(new_default_hints[hint]) + # but all upper cased + assert set(n.upper() for n in regexes) == set(new_default_hints[hint]) + + +def test_normalize_default_hints(schema_storage_no_import: SchemaStorage) -> None: + # use destination caps to force naming convention + from dlt.common.destination import DestinationCapabilitiesContext + from dlt.common.configuration.container import Container + + eth_V9 = 
load_yml_case("schemas/eth/ethereum_schema_v9") + orig_schema = Schema.from_dict(eth_V9) + # save schema + schema_storage_no_import.save_schema(orig_schema) + + with Container().injectable_context( + DestinationCapabilitiesContext.generic_capabilities( + naming_convention=sql_upper.NamingConvention() + ) + ) as caps: + assert isinstance(caps.naming_convention, sql_upper.NamingConvention) + # creating a schema from dict keeps original normalizers + schema = Schema.from_dict(eth_V9) + assert_schema_identifiers_case(schema, str.lower) + assert schema._normalizers_config["names"].endswith("snake_case") # type: ignore + + # loading from storage keeps storage normalizers + storage_schema = schema_storage_no_import.load_schema("ethereum") + assert_schema_identifiers_case(storage_schema, str.lower) + assert storage_schema._normalizers_config["names"].endswith("snake_case") # type: ignore + + # new schema instance is created using caps/config + new_schema = Schema("new") + assert_schema_identifiers_case(new_schema, str.upper) + assert isinstance(new_schema._normalizers_config["names"], NamingConvention) + + # attempt to update normalizers blocked by tables with data + with pytest.raises(TableIdentifiersFrozen): + schema.update_normalizers() + # also cloning with update normalizers + with pytest.raises(TableIdentifiersFrozen): + schema.clone(update_normalizers=True) + + # remove processing hints and normalize + norm_cloned = schema.clone(update_normalizers=True, remove_processing_hints=True) + assert_schema_identifiers_case(norm_cloned, str.upper) + assert isinstance(norm_cloned._normalizers_config["names"], NamingConvention) + + norm_schema = Schema.from_dict( + deepcopy(eth_V9), remove_processing_hints=True, bump_version=False + ) + norm_schema.update_normalizers() + assert_schema_identifiers_case(norm_schema, str.upper) + assert isinstance(norm_schema._normalizers_config["names"], NamingConvention) + + # both ways of obtaining schemas (cloning, cleaning dict) must generate identical schemas + assert norm_cloned.to_pretty_json() == norm_schema.to_pretty_json() + + # save to storage + schema_storage_no_import.save_schema(norm_cloned) + + # load schema out of caps + storage_schema = schema_storage_no_import.load_schema("ethereum") + assert_schema_identifiers_case(storage_schema, str.upper) + # the instance got converted into + assert storage_schema._normalizers_config["names"].endswith("sql_upper.NamingConvention") # type: ignore + assert storage_schema.stored_version_hash == storage_schema.version_hash + # cloned when bumped must have same version hash + norm_cloned._bump_version() + assert storage_schema.stored_version_hash == norm_cloned.stored_version_hash + + +def test_raise_on_change_identifier_table_with_data() -> None: + schema_dict: TStoredSchema = load_json_case("schemas/github/issues.schema") + schema = Schema.from_dict(schema_dict) # type: ignore[arg-type] + # mark issues table to seen data and change naming to sql upper + issues_table = schema.tables["issues"] + issues_table["x-normalizer"] = {"seen-data": True} + os.environ["SCHEMA__NAMING"] = "tests.common.cases.normalizers.sql_upper" + with pytest.raises(TableIdentifiersFrozen) as fr_ex: + schema.update_normalizers() + assert fr_ex.value.table_name == "issues" + assert isinstance(fr_ex.value.from_naming, snake_case.NamingConvention) + assert isinstance(fr_ex.value.to_naming, sql_upper.NamingConvention) + # try again, get exception (schema was not partially modified) + with pytest.raises(TableIdentifiersFrozen) as fr_ex: + 
schema.update_normalizers() + + # use special naming convention that only changes column names ending with x to _ + issues_table["columns"]["columnx"] = {"name": "columnx", "data_type": "bigint"} + assert schema.tables["issues"] is issues_table + os.environ["SCHEMA__NAMING"] = "tests.common.normalizers.snake_no_x" + with pytest.raises(TableIdentifiersFrozen) as fr_ex: + schema.update_normalizers() + assert fr_ex.value.table_name == "issues" + # allow to change tables with data + os.environ["SCHEMA__ALLOW_IDENTIFIER_CHANGE_ON_TABLE_WITH_DATA"] = "True" + schema.update_normalizers() + assert schema._normalizers_config["allow_identifier_change_on_table_with_data"] is True + + +def assert_schema_identifiers_case(schema: Schema, casing: Callable[[str], str]) -> None: + for table_name, table in schema.tables.items(): + assert table_name == casing(table_name) == table["name"] + if "parent" in table: + assert table["parent"] == casing(table["parent"]) + for col_name, column in table["columns"].items(): + assert col_name == casing(col_name) == column["name"] + + # make sure table prefixes are set + assert schema._dlt_tables_prefix == casing("_dlt") + assert schema.loads_table_name == casing("_dlt_loads") + assert schema.version_table_name == casing("_dlt_version") + assert schema.state_table_name == casing("_dlt_pipeline_state") + + def _case_regex(regex: str) -> str: + if regex.startswith(SIMPLE_REGEX_PREFIX): + return SIMPLE_REGEX_PREFIX + casing(regex[3:]) + else: + return casing(regex) + + # regexes are uppercased + new_default_hints = schema._settings["default_hints"] + for hint, regexes in new_default_hints.items(): + # but all upper cased + assert set(_case_regex(n) for n in regexes) == set(new_default_hints[hint]) + + +def assert_new_schema_values_custom_normalizers(schema: Schema) -> None: + # check normalizers config + assert schema._normalizers_config["names"] == "tests.common.normalizers.custom_normalizers" + assert ( + schema._normalizers_config["json"]["module"] + == "tests.common.normalizers.custom_normalizers" + ) + # check if schema was extended by json normalizer + assert ["fake_id"] == schema.settings["default_hints"]["not_null"] + # call normalizers + assert schema.naming.normalize_identifier("a") == "column_a" + assert schema.naming.normalize_path("a__b") == "column_a__column_b" + assert schema.naming.normalize_identifier("1A_b") == "column_1a_b" + # assumes elements are normalized + assert schema.naming.make_path("A", "B", "!C") == "A__B__!C" + assert schema.naming.break_path("A__B__!C") == ["A", "B", "!C"] + row = list(schema.normalize_data_item({"bool": True}, "load_id", "a_table")) + assert row[0] == (("a_table", None), {"bool": True}) diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index 887b0aa9a0..93be165358 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -1,19 +1,17 @@ -from copy import deepcopy import os -from typing import List, Sequence, cast +from typing import Dict, List, Sequence import pytest +from copy import deepcopy from dlt.common import pendulum -from dlt.common.configuration import resolve_configuration -from dlt.common.configuration.container import Container +from dlt.common.json import json +from dlt.common.data_types.typing import TDataType from dlt.common.schema.migrations import migrate_schema -from dlt.common.storages import SchemaStorageConfiguration -from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.exceptions import 
DictValidationException -from dlt.common.normalizers.naming import snake_case, direct +from dlt.common.normalizers.naming import snake_case from dlt.common.typing import DictStrAny, StrAny from dlt.common.utils import uniq_id -from dlt.common.schema import TColumnSchema, Schema, TStoredSchema, utils, TColumnHint +from dlt.common.schema import TColumnSchema, Schema, TStoredSchema, utils from dlt.common.schema.exceptions import ( InvalidSchemaName, ParentTableNotFoundException, @@ -28,50 +26,12 @@ ) from dlt.common.storages import SchemaStorage -from tests.utils import autouse_test_storage, preserve_environ from tests.common.utils import load_json_case, load_yml_case, COMMON_TEST_CASES_PATH SCHEMA_NAME = "event" EXPECTED_FILE_NAME = f"{SCHEMA_NAME}.schema.json" -@pytest.fixture -def schema_storage() -> SchemaStorage: - C = resolve_configuration( - SchemaStorageConfiguration(), - explicit_value={ - "import_schema_path": "tests/common/cases/schemas/rasa", - "external_schema_format": "json", - }, - ) - return SchemaStorage(C, makedirs=True) - - -@pytest.fixture -def schema_storage_no_import() -> SchemaStorage: - C = resolve_configuration(SchemaStorageConfiguration()) - return SchemaStorage(C, makedirs=True) - - -@pytest.fixture -def schema() -> Schema: - return Schema("event") - - -@pytest.fixture -def cn_schema() -> Schema: - return Schema( - "column_default", - { - "names": "tests.common.normalizers.custom_normalizers", - "json": { - "module": "tests.common.normalizers.custom_normalizers", - "config": {"not_null": ["fake_id"]}, - }, - }, - ) - - def test_normalize_schema_name(schema: Schema) -> None: assert schema.naming.normalize_table_identifier("BAN_ANA") == "ban_ana" assert schema.naming.normalize_table_identifier("event-.!:value") == "event_value" @@ -102,38 +62,6 @@ def test_new_schema(schema: Schema) -> None: utils.validate_stored_schema(stored_schema) -def test_new_schema_custom_normalizers(cn_schema: Schema) -> None: - assert_is_new_schema(cn_schema) - assert_new_schema_props_custom_normalizers(cn_schema) - - -def test_schema_config_normalizers(schema: Schema, schema_storage_no_import: SchemaStorage) -> None: - # save snake case schema - schema_storage_no_import.save_schema(schema) - # config direct naming convention - os.environ["SCHEMA__NAMING"] = "direct" - # new schema has direct naming convention - schema_direct_nc = Schema("direct_naming") - assert schema_direct_nc._normalizers_config["names"] == "direct" - # still after loading the config is "snake" - schema = schema_storage_no_import.load_schema(schema.name) - assert schema._normalizers_config["names"] == "snake_case" - # provide capabilities context - destination_caps = DestinationCapabilitiesContext.generic_capabilities() - destination_caps.naming_convention = "snake_case" - destination_caps.max_identifier_length = 127 - with Container().injectable_context(destination_caps): - # caps are ignored if schema is configured - schema_direct_nc = Schema("direct_naming") - assert schema_direct_nc._normalizers_config["names"] == "direct" - # but length is there - assert schema_direct_nc.naming.max_length == 127 - # also for loaded schema - schema = schema_storage_no_import.load_schema(schema.name) - assert schema._normalizers_config["names"] == "snake_case" - assert schema.naming.max_length == 127 - - def test_simple_regex_validator() -> None: # can validate only simple regexes assert utils.simple_regex_validator(".", "k", "v", str) is False @@ -394,33 +322,6 @@ def test_save_store_schema(schema: Schema, schema_storage: 
SchemaStorage) -> Non assert_new_schema_props(schema_copy) -def test_save_store_schema_custom_normalizers( - cn_schema: Schema, schema_storage: SchemaStorage -) -> None: - schema_storage.save_schema(cn_schema) - schema_copy = schema_storage.load_schema(cn_schema.name) - assert_new_schema_props_custom_normalizers(schema_copy) - - -def test_save_load_incomplete_column( - schema: Schema, schema_storage_no_import: SchemaStorage -) -> None: - # make sure that incomplete column is saved and restored without default hints - incomplete_col = utils.new_column("I", nullable=False) - incomplete_col["primary_key"] = True - incomplete_col["x-special"] = "spec" # type: ignore[typeddict-unknown-key] - table = utils.new_table("table", columns=[incomplete_col]) - schema.update_table(table) - schema_storage_no_import.save_schema(schema) - schema_copy = schema_storage_no_import.load_schema("event") - assert schema_copy.get_table("table")["columns"]["I"] == { - "name": "I", - "nullable": False, - "primary_key": True, - "x-special": "spec", - } - - def test_upgrade_engine_v1_schema() -> None: schema_dict: DictStrAny = load_json_case("schemas/ev1/event.schema") # ensure engine v1 @@ -479,7 +380,7 @@ def test_unknown_engine_upgrade() -> None: def test_preserve_column_order(schema: Schema, schema_storage: SchemaStorage) -> None: # python dicts are ordered from v3.6, add 50 column with random names update: List[TColumnSchema] = [ - schema._infer_column(uniq_id(), pendulum.now().timestamp()) for _ in range(50) + schema._infer_column("t" + uniq_id(), pendulum.now().timestamp()) for _ in range(50) ] schema.update_table(utils.new_table("event_test_order", columns=update)) @@ -496,7 +397,7 @@ def verify_items(table, update) -> None: verify_items(table, update) # add more columns update2: List[TColumnSchema] = [ - schema._infer_column(uniq_id(), pendulum.now().timestamp()) for _ in range(50) + schema._infer_column("t" + uniq_id(), pendulum.now().timestamp()) for _ in range(50) ] loaded_schema.update_table(utils.new_table("event_test_order", columns=update2)) table = loaded_schema.get_table_columns("event_test_order") @@ -648,6 +549,79 @@ def test_merge_hints(schema: Schema) -> None: for k in expected_hints: assert set(expected_hints[k]) == set(schema._settings["default_hints"][k]) # type: ignore[index] + # make sure that re:^_dlt_id$ and _dlt_id are equivalent when merging so we can use both forms + alt_form_hints = { + "not_null": ["re:^_dlt_id$"], + "foreign_key": ["_dlt_parent_id"], + } + schema.merge_hints(alt_form_hints) # type: ignore[arg-type] + # we keep the older forms so nothing changed + assert len(expected_hints) == len(schema._settings["default_hints"]) + for k in expected_hints: + assert set(expected_hints[k]) == set(schema._settings["default_hints"][k]) # type: ignore[index] + + # check normalize some regex forms + upper_hints = { + "not_null": [ + "_DLT_ID", + ], + "foreign_key": ["re:^_DLT_PARENT_ID$"], + } + schema.merge_hints(upper_hints) # type: ignore[arg-type] + # all upper form hints can be automatically converted to lower form + assert len(expected_hints) == len(schema._settings["default_hints"]) + for k in expected_hints: + assert set(expected_hints[k]) == set(schema._settings["default_hints"][k]) # type: ignore[index] + + # this form cannot be converted + upper_hints = { + "not_null": [ + "re:TU[b-b]a", + ], + } + schema.merge_hints(upper_hints) # type: ignore[arg-type] + assert "re:TU[b-b]a" in schema.settings["default_hints"]["not_null"] + + +def test_update_preferred_types(schema: Schema) -> 
None: + # no preferred types in the schema + assert "preferred_types" not in schema.settings + + expected: Dict[TSimpleRegex, TDataType] = { + TSimpleRegex("_dlt_id"): "bigint", + TSimpleRegex("re:^timestamp$"): "timestamp", + } + schema.update_preferred_types(expected) + assert schema.settings["preferred_types"] == expected + # no changes + schema.update_preferred_types(expected) + assert schema.settings["preferred_types"] == expected + + # add and replace, canonical form used to update / replace + updated: Dict[TSimpleRegex, TDataType] = { + TSimpleRegex("_dlt_id"): "decimal", + TSimpleRegex("timestamp"): "date", + TSimpleRegex("re:TU[b-c]a"): "text", + } + schema.update_preferred_types(updated) + assert schema.settings["preferred_types"] == { + "_dlt_id": "decimal", + "re:^timestamp$": "date", + "re:TU[b-c]a": "text", + } + + # will normalize some form of regex + updated = { + TSimpleRegex("_DLT_id"): "text", + TSimpleRegex("re:^TIMESTAMP$"): "timestamp", + } + schema.update_preferred_types(updated) + assert schema.settings["preferred_types"] == { + "_dlt_id": "text", + "re:^timestamp$": "timestamp", + "re:TU[b-c]a": "text", + } + def test_default_table_resource() -> None: """Parent tables without `resource` set default to table name""" @@ -766,9 +740,9 @@ def test_normalize_table_identifiers() -> None: assert "reactions___1" in schema.tables["issues"]["columns"] issues_table = deepcopy(schema.tables["issues"]) # this schema is already normalized so normalization is idempotent - assert schema.tables["issues"] == schema.normalize_table_identifiers(issues_table) - assert schema.tables["issues"] == schema.normalize_table_identifiers( - schema.normalize_table_identifiers(issues_table) + assert schema.tables["issues"] == utils.normalize_table_identifiers(issues_table, schema.naming) + assert schema.tables["issues"] == utils.normalize_table_identifiers( + utils.normalize_table_identifiers(issues_table, schema.naming), schema.naming ) @@ -780,7 +754,10 @@ def test_normalize_table_identifiers_merge_columns() -> None: ] # schema normalizing to snake case will conflict on case and Case table = utils.new_table("blend", columns=table_create) # type: ignore[arg-type] - norm_table = Schema("norm").normalize_table_identifiers(table) + table_str = json.dumps(table) + norm_table = utils.normalize_table_identifiers(table, Schema("norm").naming) + # nothing got changed in original table + assert table_str == json.dumps(table) # only one column assert len(norm_table["columns"]) == 1 assert norm_table["columns"]["case"] == { @@ -859,20 +836,21 @@ def test_group_tables_by_resource(schema: Schema) -> None: schema.update_table(utils.new_table("a_events", columns=[])) schema.update_table(utils.new_table("b_events", columns=[])) schema.update_table(utils.new_table("c_products", columns=[], resource="products")) - schema.update_table(utils.new_table("a_events__1", columns=[], parent_table_name="a_events")) + schema.update_table(utils.new_table("a_events___1", columns=[], parent_table_name="a_events")) schema.update_table( - utils.new_table("a_events__1__2", columns=[], parent_table_name="a_events__1") + utils.new_table("a_events___1___2", columns=[], parent_table_name="a_events___1") ) - schema.update_table(utils.new_table("b_events__1", columns=[], parent_table_name="b_events")) + schema.update_table(utils.new_table("b_events___1", columns=[], parent_table_name="b_events")) + # print(schema.to_pretty_yaml()) # All resources without filter expected_tables = { "a_events": [ schema.tables["a_events"], - 
schema.tables["a_events__1"], - schema.tables["a_events__1__2"], + schema.tables["a_events___1"], + schema.tables["a_events___1___2"], ], - "b_events": [schema.tables["b_events"], schema.tables["b_events__1"]], + "b_events": [schema.tables["b_events"], schema.tables["b_events___1"]], "products": [schema.tables["c_products"]], "_dlt_version": [schema.tables["_dlt_version"]], "_dlt_loads": [schema.tables["_dlt_loads"]], @@ -887,10 +865,10 @@ def test_group_tables_by_resource(schema: Schema) -> None: assert result == { "a_events": [ schema.tables["a_events"], - schema.tables["a_events__1"], - schema.tables["a_events__1__2"], + schema.tables["a_events___1"], + schema.tables["a_events___1___2"], ], - "b_events": [schema.tables["b_events"], schema.tables["b_events__1"]], + "b_events": [schema.tables["b_events"], schema.tables["b_events___1"]], } # With resources that has many top level tables @@ -919,3 +897,41 @@ def test_group_tables_by_resource(schema: Schema) -> None: {"columns": {}, "name": "mc_products__sub", "parent": "mc_products"}, ] } + + +def test_remove_processing_hints() -> None: + eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9") + # here tables contain processing hints + schema = Schema.from_dict(eth_V9) + assert "x-normalizer" in schema.tables["blocks"] + + # clone with hints removal, note that clone does not bump version + cloned = schema.clone(remove_processing_hints=True) + assert "x-normalizer" not in cloned.tables["blocks"] + # clone does not touch original schema + assert "x-normalizer" in schema.tables["blocks"] + + # to string + to_yaml = schema.to_pretty_yaml() + assert "x-normalizer" in to_yaml + to_yaml = schema.to_pretty_yaml(remove_processing_hints=True) + assert "x-normalizer" not in to_yaml + to_json = schema.to_pretty_json() + assert "x-normalizer" in to_json + to_json = schema.to_pretty_json(remove_processing_hints=True) + assert "x-normalizer" not in to_json + + # load without hints + no_hints = schema.from_dict(eth_V9, remove_processing_hints=True, bump_version=False) + assert no_hints.stored_version_hash == cloned.stored_version_hash + + # now load without hints but with version bump + cloned._bump_version() + no_hints = schema.from_dict(eth_V9, remove_processing_hints=True) + assert no_hints.stored_version_hash == cloned.stored_version_hash + + +# def test_get_new_table_columns() -> None: +# pytest.fail(reason="must implement!") +# pass +# get_new_table_columns() diff --git a/tests/common/schema/test_versioning.py b/tests/common/schema/test_versioning.py index b67b028161..788da09533 100644 --- a/tests/common/schema/test_versioning.py +++ b/tests/common/schema/test_versioning.py @@ -1,6 +1,5 @@ import pytest import yaml -from copy import deepcopy from dlt.common import json from dlt.common.schema import utils diff --git a/tests/common/storages/test_file_storage.py b/tests/common/storages/test_file_storage.py index eae765398b..7a10e29097 100644 --- a/tests/common/storages/test_file_storage.py +++ b/tests/common/storages/test_file_storage.py @@ -39,38 +39,40 @@ def test_to_relative_path(test_storage: FileStorage) -> None: def test_make_full_path(test_storage: FileStorage) -> None: # fully within storage relative_path = os.path.join("dir", "to", "file") - path = test_storage.make_full_path(relative_path) + path = test_storage.make_full_path_safe(relative_path) assert path.endswith(os.path.join(TEST_STORAGE_ROOT, relative_path)) # overlapped with storage root_path = os.path.join(TEST_STORAGE_ROOT, relative_path) - path = 
test_storage.make_full_path(root_path) + path = test_storage.make_full_path_safe(root_path) assert path.endswith(root_path) assert path.count(TEST_STORAGE_ROOT) == 2 # absolute path with different root than TEST_STORAGE_ROOT does not lead into storage so calculating full path impossible with pytest.raises(ValueError): - test_storage.make_full_path(os.path.join("/", root_path)) + test_storage.make_full_path_safe(os.path.join("/", root_path)) # relative path out of the root with pytest.raises(ValueError): - test_storage.make_full_path("..") + test_storage.make_full_path_safe("..") # absolute overlapping path - path = test_storage.make_full_path(os.path.abspath(root_path)) + path = test_storage.make_full_path_safe(os.path.abspath(root_path)) assert path.endswith(root_path) - assert test_storage.make_full_path("") == test_storage.storage_path - assert test_storage.make_full_path(".") == test_storage.storage_path + assert test_storage.make_full_path_safe("") == test_storage.storage_path + assert test_storage.make_full_path_safe(".") == test_storage.storage_path def test_in_storage(test_storage: FileStorage) -> None: # always relative to storage root - assert test_storage.in_storage("a/b/c") is True - assert test_storage.in_storage(f"../{TEST_STORAGE_ROOT}/b/c") is True - assert test_storage.in_storage("../a/b/c") is False - assert test_storage.in_storage("../../../a/b/c") is False - assert test_storage.in_storage("/a") is False - assert test_storage.in_storage(".") is True - assert test_storage.in_storage(os.curdir) is True - assert test_storage.in_storage(os.path.realpath(os.curdir)) is False + assert test_storage.is_path_in_storage("a/b/c") is True + assert test_storage.is_path_in_storage(f"../{TEST_STORAGE_ROOT}/b/c") is True + assert test_storage.is_path_in_storage("../a/b/c") is False + assert test_storage.is_path_in_storage("../../../a/b/c") is False + assert test_storage.is_path_in_storage("/a") is False + assert test_storage.is_path_in_storage(".") is True + assert test_storage.is_path_in_storage(os.curdir) is True + assert test_storage.is_path_in_storage(os.path.realpath(os.curdir)) is False assert ( - test_storage.in_storage(os.path.join(os.path.realpath(os.curdir), TEST_STORAGE_ROOT)) + test_storage.is_path_in_storage( + os.path.join(os.path.realpath(os.curdir), TEST_STORAGE_ROOT) + ) is True ) @@ -164,7 +166,7 @@ def test_rmtree_ro(test_storage: FileStorage) -> None: test_storage.create_folder("protected") path = test_storage.save("protected/barbapapa.txt", "barbapapa") os.chmod(path, stat.S_IREAD) - os.chmod(test_storage.make_full_path("protected"), stat.S_IREAD) + os.chmod(test_storage.make_full_path_safe("protected"), stat.S_IREAD) with pytest.raises(PermissionError): test_storage.delete_folder("protected", recursively=True, delete_ro=False) test_storage.delete_folder("protected", recursively=True, delete_ro=True) diff --git a/tests/common/storages/test_load_package.py b/tests/common/storages/test_load_package.py index ecbc5d296d..45bc8d157e 100644 --- a/tests/common/storages/test_load_package.py +++ b/tests/common/storages/test_load_package.py @@ -8,10 +8,8 @@ from dlt.common import sleep from dlt.common.schema import Schema from dlt.common.storages import PackageStorage, LoadStorage, ParsedLoadJobFileName +from dlt.common.storages.exceptions import LoadPackageAlreadyCompleted, LoadPackageNotCompleted from dlt.common.utils import uniq_id - -from tests.common.storages.utils import start_loading_file, assert_package_info, load_storage -from tests.utils import autouse_test_storage 
from dlt.common.pendulum import pendulum from dlt.common.configuration.container import Container from dlt.common.storages.load_package import ( @@ -23,6 +21,9 @@ clear_destination_state, ) +from tests.common.storages.utils import start_loading_file, assert_package_info, load_storage +from tests.utils import TEST_STORAGE_ROOT, autouse_test_storage + def test_is_partially_loaded(load_storage: LoadStorage) -> None: load_id, file_name = start_loading_file( @@ -243,6 +244,177 @@ def test_build_parse_job_path(load_storage: LoadStorage) -> None: ParsedLoadJobFileName.parse("tab.id.wrong_retry.jsonl") +def test_load_package_listings(load_storage: LoadStorage) -> None: + # 100 csv files + load_id = create_load_package(load_storage.new_packages, 100) + new_jobs = load_storage.new_packages.list_new_jobs(load_id) + assert len(new_jobs) == 100 + assert len(load_storage.new_packages.list_job_with_states_for_table(load_id, "items_1")) == 100 + assert len(load_storage.new_packages.list_job_with_states_for_table(load_id, "items_2")) == 0 + assert len(load_storage.new_packages.list_all_jobs_with_states(load_id)) == 100 + assert len(load_storage.new_packages.list_started_jobs(load_id)) == 0 + assert len(load_storage.new_packages.list_failed_jobs(load_id)) == 0 + assert load_storage.new_packages.is_package_completed(load_id) is False + with pytest.raises(LoadPackageNotCompleted): + load_storage.new_packages.list_failed_jobs_infos(load_id) + # add a few more files + add_new_jobs(load_storage.new_packages, load_id, 7, "items_2") + assert len(load_storage.new_packages.list_job_with_states_for_table(load_id, "items_1")) == 100 + assert len(load_storage.new_packages.list_job_with_states_for_table(load_id, "items_2")) == 7 + j_w_s = load_storage.new_packages.list_all_jobs_with_states(load_id) + assert len(j_w_s) == 107 + assert all(job[0] == "new_jobs" for job in j_w_s) + with pytest.raises(FileNotFoundError): + load_storage.new_packages.get_job_failed_message(load_id, j_w_s[0][1]) + # get package infos + package_jobs = load_storage.new_packages.get_load_package_jobs(load_id) + assert len(package_jobs["new_jobs"]) == 107 + # other folders empty + assert len(package_jobs["started_jobs"]) == 0 + package_info = load_storage.new_packages.get_load_package_info(load_id) + assert len(package_info.jobs["new_jobs"]) == 107 + assert len(package_info.jobs["completed_jobs"]) == 0 + assert package_info.load_id == load_id + # full path + assert package_info.package_path == load_storage.new_packages.storage.make_full_path(load_id) + assert package_info.state == "new" + assert package_info.completed_at is None + + # move some files + new_jobs = sorted(load_storage.new_packages.list_new_jobs(load_id)) + load_storage.new_packages.start_job(load_id, os.path.basename(new_jobs[0])) + load_storage.new_packages.start_job(load_id, os.path.basename(new_jobs[1])) + load_storage.new_packages.start_job(load_id, os.path.basename(new_jobs[-1])) + load_storage.new_packages.start_job(load_id, os.path.basename(new_jobs[-2])) + + assert len(load_storage.new_packages.list_started_jobs(load_id)) == 4 + assert len(load_storage.new_packages.list_new_jobs(load_id)) == 103 + assert len(load_storage.new_packages.list_job_with_states_for_table(load_id, "items_1")) == 100 + assert len(load_storage.new_packages.list_job_with_states_for_table(load_id, "items_2")) == 7 + package_jobs = load_storage.new_packages.get_load_package_jobs(load_id) + assert len(package_jobs["new_jobs"]) == 103 + assert len(package_jobs["started_jobs"]) == 4 + package_info = 
load_storage.new_packages.get_load_package_info(load_id) + assert len(package_info.jobs["new_jobs"]) == 103 + assert len(package_info.jobs["started_jobs"]) == 4 + + # complete and fail some + load_storage.new_packages.complete_job(load_id, os.path.basename(new_jobs[0])) + load_storage.new_packages.fail_job(load_id, os.path.basename(new_jobs[1]), None) + load_storage.new_packages.fail_job(load_id, os.path.basename(new_jobs[-1]), "error!") + path = load_storage.new_packages.retry_job(load_id, os.path.basename(new_jobs[-2])) + assert ParsedLoadJobFileName.parse(path).retry_count == 1 + assert ( + load_storage.new_packages.get_job_failed_message( + load_id, ParsedLoadJobFileName.parse(new_jobs[1]) + ) + is None + ) + assert ( + load_storage.new_packages.get_job_failed_message( + load_id, ParsedLoadJobFileName.parse(new_jobs[-1]) + ) + == "error!" + ) + # can't move again + with pytest.raises(FileNotFoundError): + load_storage.new_packages.complete_job(load_id, os.path.basename(new_jobs[0])) + assert len(load_storage.new_packages.list_started_jobs(load_id)) == 0 + # retry back in new + assert len(load_storage.new_packages.list_new_jobs(load_id)) == 104 + package_jobs = load_storage.new_packages.get_load_package_jobs(load_id) + assert len(package_jobs["new_jobs"]) == 104 + assert len(package_jobs["started_jobs"]) == 0 + assert len(package_jobs["completed_jobs"]) == 1 + assert len(package_jobs["failed_jobs"]) == 2 + assert len(load_storage.new_packages.list_failed_jobs(load_id)) == 2 + package_info = load_storage.new_packages.get_load_package_info(load_id) + assert len(package_info.jobs["new_jobs"]) == 104 + assert len(package_info.jobs["started_jobs"]) == 0 + assert len(package_info.jobs["completed_jobs"]) == 1 + assert len(package_info.jobs["failed_jobs"]) == 2 + + # complete package + load_storage.new_packages.complete_loading_package(load_id, "aborted") + assert load_storage.new_packages.is_package_completed(load_id) + with pytest.raises(LoadPackageAlreadyCompleted): + load_storage.new_packages.complete_loading_package(load_id, "aborted") + + for job in package_info.jobs["failed_jobs"] + load_storage.new_packages.list_failed_jobs_infos( # type: ignore[operator] + load_id + ): + if job.job_file_info.table_name == "items_1": + assert job.failed_message is None + elif job.job_file_info.table_name == "items_2": + assert job.failed_message == "error!" + else: + raise AssertionError() + assert job.created_at is not None + assert job.elapsed is not None + assert job.file_size > 0 + assert job.state == "failed_jobs" + # must be abs path! 
+ assert os.path.isabs(job.file_path) + + +def test_get_load_package_info_perf(load_storage: LoadStorage) -> None: + import time + + st_t = time.time() + for _ in range(10000): + load_storage.loaded_packages.storage.make_full_path("198291092.121/new/ABD.CX.gx") + # os.path.basename("198291092.121/new/ABD.CX.gx") + print(time.time() - st_t) + + st_t = time.time() + load_id = create_load_package(load_storage.loaded_packages, 10000) + print(time.time() - st_t) + + st_t = time.time() + # move half of the files to failed + for file_name in load_storage.loaded_packages.list_new_jobs(load_id)[:1000]: + load_storage.loaded_packages.start_job(load_id, os.path.basename(file_name)) + load_storage.loaded_packages.fail_job( + load_id, os.path.basename(file_name), f"FAILED {file_name}" + ) + print(time.time() - st_t) + + st_t = time.time() + load_storage.loaded_packages.get_load_package_info(load_id) + print(time.time() - st_t) + + st_t = time.time() + table_stat = {} + for file in load_storage.loaded_packages.list_new_jobs(load_id): + parsed = ParsedLoadJobFileName.parse(file) + table_stat[parsed.table_name] = parsed + print(time.time() - st_t) + + +def create_load_package( + package_storage: PackageStorage, new_jobs: int, table_name="items_1" +) -> str: + schema = Schema("test") + load_id = create_load_id() + package_storage.create_package(load_id) + package_storage.save_schema(load_id, schema) + add_new_jobs(package_storage, load_id, new_jobs, table_name) + return load_id + + +def add_new_jobs( + package_storage: PackageStorage, load_id: str, new_jobs: int, table_name="items_1" +) -> None: + for _ in range(new_jobs): + file_name = PackageStorage.build_job_file_name( + table_name, ParsedLoadJobFileName.new_file_id(), 0, False, "csv" + ) + file_path = os.path.join(TEST_STORAGE_ROOT, file_name) + with open(file_path, "wt", encoding="utf-8") as f: + f.write("a|b|c") + package_storage.import_job(load_id, file_path) + + def test_migrate_to_load_package_state() -> None: """ Here we test that an existing load package without a state will not error diff --git a/tests/common/storages/test_load_storage.py b/tests/common/storages/test_load_storage.py index e8686ac2f9..49deaff23e 100644 --- a/tests/common/storages/test_load_storage.py +++ b/tests/common/storages/test_load_storage.py @@ -33,7 +33,7 @@ def test_complete_successful_package(load_storage: LoadStorage) -> None: # but completed packages are deleted load_storage.maybe_remove_completed_jobs(load_id) assert not load_storage.loaded_packages.storage.has_folder( - load_storage.loaded_packages.get_job_folder_path(load_id, "completed_jobs") + load_storage.loaded_packages.get_job_state_folder_path(load_id, "completed_jobs") ) assert_package_info(load_storage, load_id, "loaded", "completed_jobs", jobs_count=0) # delete completed package @@ -56,7 +56,7 @@ def test_complete_successful_package(load_storage: LoadStorage) -> None: ) # has completed loads assert load_storage.loaded_packages.storage.has_folder( - load_storage.loaded_packages.get_job_folder_path(load_id, "completed_jobs") + load_storage.loaded_packages.get_job_state_folder_path(load_id, "completed_jobs") ) load_storage.delete_loaded_package(load_id) assert not load_storage.storage.has_folder(load_storage.get_loaded_package_path(load_id)) @@ -82,14 +82,14 @@ def test_complete_package_failed_jobs(load_storage: LoadStorage) -> None: assert load_storage.storage.has_folder(load_storage.get_loaded_package_path(load_id)) # has completed loads assert load_storage.loaded_packages.storage.has_folder( - 
load_storage.loaded_packages.get_job_folder_path(load_id, "completed_jobs") + load_storage.loaded_packages.get_job_state_folder_path(load_id, "completed_jobs") ) assert_package_info(load_storage, load_id, "loaded", "failed_jobs") # get failed jobs info failed_files = sorted(load_storage.loaded_packages.list_failed_jobs(load_id)) - # job + message - assert len(failed_files) == 2 + # only jobs + assert len(failed_files) == 1 assert load_storage.loaded_packages.storage.has_file(failed_files[0]) failed_info = load_storage.list_failed_jobs_in_loaded_package(load_id) assert failed_info[0].file_path == load_storage.loaded_packages.storage.make_full_path( @@ -117,7 +117,7 @@ def test_abort_package(load_storage: LoadStorage) -> None: assert_package_info(load_storage, load_id, "normalized", "failed_jobs") load_storage.complete_load_package(load_id, True) assert load_storage.loaded_packages.storage.has_folder( - load_storage.loaded_packages.get_job_folder_path(load_id, "completed_jobs") + load_storage.loaded_packages.get_job_state_folder_path(load_id, "completed_jobs") ) assert_package_info(load_storage, load_id, "aborted", "failed_jobs") diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py index e97fac8a9e..ffbd2ecf1b 100644 --- a/tests/common/storages/test_schema_storage.py +++ b/tests/common/storages/test_schema_storage.py @@ -1,12 +1,10 @@ import os -import shutil import pytest import yaml from dlt.common import json -from dlt.common.normalizers import explicit_normalizers +from dlt.common.normalizers.utils import explicit_normalizers from dlt.common.schema.schema import Schema -from dlt.common.schema.typing import TStoredSchema from dlt.common.storages.exceptions import ( InStorageSchemaModified, SchemaNotFoundError, @@ -20,9 +18,9 @@ ) from tests.utils import autouse_test_storage, TEST_STORAGE_ROOT +from tests.common.storages.utils import prepare_eth_import_folder from tests.common.utils import ( load_yml_case, - yml_case_path, COMMON_TEST_CASES_PATH, IMPORTED_VERSION_HASH_ETH_V9, ) @@ -234,7 +232,7 @@ def test_getter(storage: SchemaStorage) -> None: def test_getter_with_import(ie_storage: SchemaStorage) -> None: with pytest.raises(KeyError): ie_storage["ethereum"] - prepare_import_folder(ie_storage) + prepare_eth_import_folder(ie_storage) # schema will be imported schema = ie_storage["ethereum"] assert schema.name == "ethereum" @@ -260,17 +258,17 @@ def test_getter_with_import(ie_storage: SchemaStorage) -> None: def test_save_store_schema_over_import(ie_storage: SchemaStorage) -> None: - prepare_import_folder(ie_storage) + prepare_eth_import_folder(ie_storage) # we have ethereum schema to be imported but we create new schema and save it schema = Schema("ethereum") schema_hash = schema.version_hash ie_storage.save_schema(schema) assert schema.version_hash == schema_hash # we linked schema to import schema - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V9 + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V9() # load schema and make sure our new schema is here schema = ie_storage.load_schema("ethereum") - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V9 + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V9() assert schema._stored_version_hash == schema_hash assert schema.version_hash == schema_hash assert schema.previous_hashes == [] @@ -283,11 +281,11 @@ def test_save_store_schema_over_import(ie_storage: SchemaStorage) -> None: def 
test_save_store_schema_over_import_sync(synced_storage: SchemaStorage) -> None: # as in test_save_store_schema_over_import but we export the new schema immediately to overwrite the imported schema - prepare_import_folder(synced_storage) + prepare_eth_import_folder(synced_storage) schema = Schema("ethereum") schema_hash = schema.version_hash synced_storage.save_schema(schema) - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V9 + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V9() # import schema is overwritten fs = FileStorage(synced_storage.config.import_schema_path) exported_name = synced_storage._file_name_in_store("ethereum", "yaml") @@ -353,6 +351,28 @@ def test_schema_from_file() -> None: ) +def test_save_initial_import_schema(ie_storage: LiveSchemaStorage) -> None: + # no schema in regular storage + with pytest.raises(SchemaNotFoundError): + ie_storage.load_schema("ethereum") + + # save initial import schema where processing hints are removed + eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9") + schema = Schema.from_dict(eth_V9) + ie_storage.save_import_schema_if_not_exists(schema) + # should be available now + eth = ie_storage.load_schema("ethereum") + assert "x-normalizer" not in eth.tables["blocks"] + + # won't overwrite initial schema + del eth_V9["tables"]["blocks__uncles"] + schema = Schema.from_dict(eth_V9) + ie_storage.save_import_schema_if_not_exists(schema) + # should be available now + eth = ie_storage.load_schema("ethereum") + assert "blocks__uncles" in eth.tables + + def test_live_schema_instances(live_storage: LiveSchemaStorage) -> None: schema = Schema("simple") live_storage.save_schema(schema) @@ -474,22 +494,14 @@ def test_new_live_schema_committed(live_storage: LiveSchemaStorage) -> None: # assert schema.settings["schema_sealed"] is True -def prepare_import_folder(storage: SchemaStorage) -> None: - shutil.copy( - yml_case_path("schemas/eth/ethereum_schema_v8"), - os.path.join(storage.storage.storage_path, "../import/ethereum.schema.yaml"), - ) - - def assert_schema_imported(synced_storage: SchemaStorage, storage: SchemaStorage) -> Schema: - prepare_import_folder(synced_storage) - eth_V9: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v9") + prepare_eth_import_folder(synced_storage) schema = synced_storage.load_schema("ethereum") # is linked to imported schema - schema._imported_version_hash = eth_V9["version_hash"] + schema._imported_version_hash = IMPORTED_VERSION_HASH_ETH_V9() # also was saved in storage assert synced_storage.has_schema("ethereum") - # and has link to imported schema s well (load without import) + # and has link to imported schema as well (load without import) schema = storage.load_schema("ethereum") - assert schema._imported_version_hash == eth_V9["version_hash"] + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V9() return schema diff --git a/tests/common/storages/utils.py b/tests/common/storages/utils.py index 3bfc3374a4..1b5a68948b 100644 --- a/tests/common/storages/utils.py +++ b/tests/common/storages/utils.py @@ -21,9 +21,12 @@ ) from dlt.common.storages import DataItemStorage, FileStorage from dlt.common.storages.fsspec_filesystem import FileItem, FileItemDict +from dlt.common.storages.schema_storage import SchemaStorage from dlt.common.typing import StrAny, TDataItems from dlt.common.utils import uniq_id +from tests.common.utils import load_yml_case + TEST_SAMPLE_FILES = "tests/common/storages/samples" MINIMALLY_EXPECTED_RELATIVE_PATHS = { "csv/freshman_kgs.csv", @@ 
-199,3 +202,12 @@ def assert_package_info( # get dict package_info.asdict() return package_info + + +def prepare_eth_import_folder(storage: SchemaStorage) -> Schema: + eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9") + # remove processing hints before installing as import schema + # ethereum schema is a "dirty" schema with processing hints + eth = Schema.from_dict(eth_V9, remove_processing_hints=True) + storage._export_schema(eth, storage.config.import_schema_path) + return eth diff --git a/tests/common/test_destination.py b/tests/common/test_destination.py index 24b0928463..2c690d94bb 100644 --- a/tests/common/test_destination.py +++ b/tests/common/test_destination.py @@ -1,10 +1,13 @@ +from typing import Dict import pytest from dlt.common.destination.reference import DestinationClientDwhConfiguration, Destination from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.exceptions import InvalidDestinationReference, UnknownDestinationModule from dlt.common.schema import Schema +from dlt.common.typing import is_subclass +from tests.common.configuration.utils import environment from tests.utils import ACTIVE_DESTINATIONS @@ -32,6 +35,96 @@ def test_custom_destination_module() -> None: ) # a full type name +def test_arguments_propagated_to_config() -> None: + dest = Destination.from_reference( + "dlt.destinations.duckdb", create_indexes=None, unknown_param="A" + ) + # None for create_indexes is not a default and it is passed on, unknown_param is removed because it is unknown + assert dest.config_params == {"create_indexes": None} + assert dest.caps_params == {} + + # test explicit config value being passed + import dlt + + dest = Destination.from_reference( + "dlt.destinations.duckdb", create_indexes=dlt.config.value, unknown_param="A" + ) + assert dest.config_params == {"create_indexes": dlt.config.value} + assert dest.caps_params == {} + + dest = Destination.from_reference( + "dlt.destinations.weaviate", naming_convention="duck_case", create_indexes=True + ) + # create indexes are not known + assert dest.config_params == {} + + # create explicit caps + dest = Destination.from_reference( + "dlt.destinations.dummy", + naming_convention="duck_case", + recommended_file_size=4000000, + loader_file_format="parquet", + ) + from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration + + assert dest.config_params == {"loader_file_format": "parquet"} + # loader_file_format is a legacy param that is duplicated as preferred_loader_file_format + assert dest.caps_params == { + "naming_convention": "duck_case", + "recommended_file_size": 4000000, + } + # instantiate configs + caps = dest.capabilities() + assert caps.naming_convention == "duck_case" + assert caps.preferred_loader_file_format == "parquet" + assert caps.recommended_file_size == 4000000 + init_config = DummyClientConfiguration() + config = dest.configuration(init_config) + assert config.loader_file_format == "parquet" # type: ignore[attr-defined] + + +def test_factory_config_injection(environment: Dict[str, str]) -> None: + environment["DESTINATION__LOADER_FILE_FORMAT"] = "parquet" + from dlt.destinations import dummy + + # caps will resolve from config without client + assert dummy().capabilities().preferred_loader_file_format == "parquet" + + caps = dummy().client(Schema("client")).capabilities + assert caps.preferred_loader_file_format == "parquet" + + environment.clear() + caps = dummy().client(Schema("client")).capabilities + assert caps.preferred_loader_file_format == 
"jsonl" + + environment["DESTINATION__DUMMY__LOADER_FILE_FORMAT"] = "parquet" + environment["DESTINATION__DUMMY__FAIL_PROB"] = "0.435" + + # config will partially resolve without client + config = dummy().configuration(None, accept_partial=True) + assert config.fail_prob == 0.435 + assert config.loader_file_format == "parquet" + + dummy_ = dummy().client(Schema("client")) + assert dummy_.capabilities.preferred_loader_file_format == "parquet" + assert dummy_.config.fail_prob == 0.435 + + # test named destination + environment.clear() + import os + from dlt.destinations import filesystem + from dlt.destinations.impl.filesystem.configuration import ( + FilesystemDestinationClientConfiguration, + ) + + filesystem_ = filesystem(destination_name="local") + abs_path = os.path.abspath("_storage") + environment["DESTINATION__LOCAL__BUCKET_URL"] = abs_path + init_config = FilesystemDestinationClientConfiguration()._bind_dataset_name(dataset_name="test") + configured_bucket_url = filesystem_.client(Schema("test"), init_config).config.bucket_url + assert configured_bucket_url.endswith("_storage") + + def test_import_module_by_path() -> None: # importing works directly from dlt destinations dest = Destination.from_reference("dlt.destinations.postgres") @@ -54,17 +147,7 @@ def test_import_module_by_path() -> None: def test_import_all_destinations() -> None: # this must pass without the client dependencies being imported for dest_type in ACTIVE_DESTINATIONS: - # generic destination needs a valid callable, otherwise instantiation will fail - additional_args = {} - if dest_type == "destination": - - def dest_callable(items, table) -> None: - pass - - additional_args["destination_callable"] = dest_callable - dest = Destination.from_reference( - dest_type, None, dest_type + "_name", "production", **additional_args - ) + dest = Destination.from_reference(dest_type, None, dest_type + "_name", "production") assert dest.destination_type == "dlt.destinations." 
+ dest_type assert dest.destination_name == dest_type + "_name" assert dest.config_params["environment"] == "production" @@ -73,6 +156,44 @@ def dest_callable(items, table) -> None: assert isinstance(dest.capabilities(), DestinationCapabilitiesContext) +def test_instantiate_all_factories() -> None: + from dlt import destinations + + impls = dir(destinations) + for impl in impls: + var_ = getattr(destinations, impl) + if not is_subclass(var_, Destination): + continue + dest = var_() + + assert dest.destination_name + assert dest.destination_type + # custom destination is named after the callable + if dest.destination_type != "dlt.destinations.destination": + assert dest.destination_type.endswith(dest.destination_name) + else: + assert dest.destination_name == "dummy_custom_destination" + assert dest.spec + assert dest.spec() + # partial configuration may always be created + init_config = dest.spec.credentials_type()() + init_config.__is_resolved__ = True + assert dest.configuration(init_config, accept_partial=True) + assert dest.capabilities() + + mod_dest = var_( + destination_name="fake_name", environment="prod", naming_convention="duck_case" + ) + assert ( + mod_dest.config_params.items() + >= {"destination_name": "fake_name", "environment": "prod"}.items() + ) + assert mod_dest.caps_params == {"naming_convention": "duck_case"} + assert mod_dest.destination_name == "fake_name" + caps = mod_dest.capabilities() + assert caps.naming_convention == "duck_case" + + def test_import_destination_config() -> None: # importing destination by type will work dest = Destination.from_reference(ref="dlt.destinations.duckdb", environment="stage") @@ -97,6 +218,7 @@ def test_import_destination_config() -> None: ref="duckdb", destination_name="my_destination", environment="devel" ) assert dest.destination_type == "dlt.destinations.duckdb" + assert dest.destination_name == "my_destination" assert dest.config_params["environment"] == "devel" config = dest.configuration(dest.spec()._bind_dataset_name(dataset_name="dataset")) # type: ignore assert config.destination_type == "duckdb" diff --git a/tests/common/utils.py b/tests/common/utils.py index a234937e56..32741128b8 100644 --- a/tests/common/utils.py +++ b/tests/common/utils.py @@ -9,14 +9,24 @@ from dlt.common import json from dlt.common.typing import StrAny -from dlt.common.schema import utils +from dlt.common.schema import utils, Schema from dlt.common.schema.typing import TTableSchemaColumns from dlt.common.configuration.providers import environ as environ_provider COMMON_TEST_CASES_PATH = "./tests/common/cases/" -# for import schema tests, change when upgrading the schema version -IMPORTED_VERSION_HASH_ETH_V9 = "PgEHvn5+BHV1jNzNYpx9aDpq6Pq1PSSetufj/h0hKg4=" + + +def IMPORTED_VERSION_HASH_ETH_V9() -> str: + # for import schema tests, change when upgrading the schema version + eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9") + assert eth_V9["version_hash"] == "PgEHvn5+BHV1jNzNYpx9aDpq6Pq1PSSetufj/h0hKg4=" + # remove processing hints before installing as import schema + # ethereum schema is a "dirty" schema with processing hints + eth = Schema.from_dict(eth_V9, remove_processing_hints=True) + return eth.stored_version_hash + + # test sentry DSN TEST_SENTRY_DSN = ( "https://797678dd0af64b96937435326c7d30c1@o1061158.ingest.sentry.io/4504306172821504" diff --git a/tests/destinations/test_custom_destination.py b/tests/destinations/test_custom_destination.py index 6834006689..6ebf7f6ef3 100644 --- a/tests/destinations/test_custom_destination.py +++ 
b/tests/destinations/test_custom_destination.py @@ -8,12 +8,13 @@ from copy import deepcopy from dlt.common.configuration.specs.base_configuration import configspec +from dlt.common.schema.schema import Schema from dlt.common.typing import TDataItems from dlt.common.schema import TTableSchema from dlt.common.data_writers.writers import TLoaderFileFormat from dlt.common.destination.reference import Destination from dlt.common.destination.exceptions import InvalidDestinationReference -from dlt.common.configuration.exceptions import ConfigFieldMissingException +from dlt.common.configuration.exceptions import ConfigFieldMissingException, ConfigurationValueError from dlt.common.configuration.specs import ConnectionStringCredentials from dlt.common.configuration.inject import get_fun_spec from dlt.common.configuration.specs import BaseConfiguration @@ -38,7 +39,7 @@ def _run_through_sink( batch_size: int = 10, ) -> List[Tuple[TDataItems, TTableSchema]]: """ - runs a list of items through the sink destination and returns colleceted calls + runs a list of items through the sink destination and returns collected calls """ calls: List[Tuple[TDataItems, TTableSchema]] = [] @@ -55,7 +56,7 @@ def items_resource() -> TDataItems: nonlocal items yield items - p = dlt.pipeline("sink_test", destination=test_sink, full_refresh=True) + p = dlt.pipeline("sink_test", destination=test_sink, dev_mode=True) p.run([items_resource()]) return calls @@ -126,6 +127,34 @@ def global_sink_func(items: TDataItems, table: TTableSchema) -> None: global_calls.append((items, table)) +def test_capabilities() -> None: + # test default caps + dest = dlt.destination()(global_sink_func)() + caps = dest.capabilities() + assert caps.preferred_loader_file_format == "typed-jsonl" + assert caps.supported_loader_file_formats == ["typed-jsonl", "parquet"] + assert caps.naming_convention == "direct" + assert caps.max_table_nesting == 0 + client_caps = dest.client(Schema("schema")).capabilities + assert dict(caps) == dict(client_caps) + + # test modified caps + dest = dlt.destination( + loader_file_format="parquet", + batch_size=0, + name="my_name", + naming_convention="snake_case", + max_table_nesting=10, + )(global_sink_func)() + caps = dest.capabilities() + assert caps.preferred_loader_file_format == "parquet" + assert caps.supported_loader_file_formats == ["typed-jsonl", "parquet"] + assert caps.naming_convention == "snake_case" + assert caps.max_table_nesting == 10 + client_caps = dest.client(Schema("schema")).capabilities + assert dict(caps) == dict(client_caps) + + def test_instantiation() -> None: # also tests _DESTINATIONS calls: List[Tuple[TDataItems, TTableSchema]] = [] @@ -140,23 +169,23 @@ def local_sink_func(items: TDataItems, table: TTableSchema, my_val=dlt.config.va # test decorator calls = [] - p = dlt.pipeline("sink_test", destination=dlt.destination()(local_sink_func), full_refresh=True) + p = dlt.pipeline("sink_test", destination=dlt.destination()(local_sink_func), dev_mode=True) p.run([1, 2, 3], table_name="items") assert len(calls) == 1 # local func does not create entry in destinations - assert not _DESTINATIONS + assert "local_sink_func" not in _DESTINATIONS # test passing via from_reference calls = [] p = dlt.pipeline( "sink_test", destination=Destination.from_reference("destination", destination_callable=local_sink_func), - full_refresh=True, + dev_mode=True, ) p.run([1, 2, 3], table_name="items") assert len(calls) == 1 # local func does not create entry in destinations - assert not _DESTINATIONS + assert 
"local_sink_func" not in _DESTINATIONS # test passing string reference global global_calls @@ -167,7 +196,7 @@ def local_sink_func(items: TDataItems, table: TTableSchema, my_val=dlt.config.va "destination", destination_callable="tests.destinations.test_custom_destination.global_sink_func", ), - full_refresh=True, + dev_mode=True, ) p.run([1, 2, 3], table_name="items") assert len(global_calls) == 1 @@ -182,9 +211,9 @@ def local_sink_func(items: TDataItems, table: TTableSchema, my_val=dlt.config.va p = dlt.pipeline( "sink_test", destination=Destination.from_reference("destination", destination_callable=None), - full_refresh=True, + dev_mode=True, ) - with pytest.raises(PipelineStepFailed): + with pytest.raises(ConfigurationValueError): p.run([1, 2, 3], table_name="items") # pass invalid string reference will fail on instantiation @@ -194,7 +223,7 @@ def local_sink_func(items: TDataItems, table: TTableSchema, my_val=dlt.config.va destination=Destination.from_reference( "destination", destination_callable="does.not.exist" ), - full_refresh=True, + dev_mode=True, ) # using decorator without args will also work @@ -206,7 +235,7 @@ def simple_decorator_sink(items, table, my_val=dlt.config.value): assert my_val == "something" calls.append((items, table)) - p = dlt.pipeline("sink_test", destination=simple_decorator_sink, full_refresh=True) # type: ignore + p = dlt.pipeline("sink_test", destination=simple_decorator_sink, dev_mode=True) # type: ignore p.run([1, 2, 3], table_name="items") assert len(calls) == 1 @@ -265,7 +294,7 @@ def assert_items_in_range(c: List[TDataItems], start: int, end: int) -> None: assert str(i) in collected_items # no errors are set, all items should be processed - p = dlt.pipeline("sink_test", destination=test_sink, full_refresh=True) + p = dlt.pipeline("sink_test", destination=test_sink, dev_mode=True) load_id = p.run([items(), items2()]).loads_ids[0] assert_items_in_range(calls["items"], 0, 100) assert_items_in_range(calls["items2"], 0, 100) @@ -278,7 +307,7 @@ def assert_items_in_range(c: List[TDataItems], start: int, end: int) -> None: # provoke errors calls = {} provoke_error = {"items": 25, "items2": 45} - p = dlt.pipeline("sink_test", destination=test_sink, full_refresh=True) + p = dlt.pipeline("sink_test", destination=test_sink, dev_mode=True) with pytest.raises(PipelineStepFailed): p.run([items(), items2()]) @@ -335,7 +364,7 @@ def snake_sink(items, table): assert table["columns"]["snake_case"]["name"] == "snake_case" assert table["columns"]["camel_case"]["name"] == "camel_case" - dlt.pipeline("sink_test", destination=snake_sink, full_refresh=True).run(resource()) + dlt.pipeline("sink_test", destination=snake_sink, dev_mode=True).run(resource()) # check default (which is direct) @dlt.destination() @@ -345,7 +374,7 @@ def direct_sink(items, table): assert table["columns"]["snake_case"]["name"] == "snake_case" assert table["columns"]["camelCase"]["name"] == "camelCase" - dlt.pipeline("sink_test", destination=direct_sink, full_refresh=True).run(resource()) + dlt.pipeline("sink_test", destination=direct_sink, dev_mode=True).run(resource()) def test_file_batch() -> None: @@ -368,7 +397,7 @@ def direct_sink(file_path, table): with pyarrow.parquet.ParquetFile(file_path) as reader: assert reader.metadata.num_rows == (100 if table["name"] == "person" else 50) - dlt.pipeline("sink_test", destination=direct_sink, full_refresh=True).run( + dlt.pipeline("sink_test", destination=direct_sink, dev_mode=True).run( [resource1(), resource2()] ) @@ -384,25 +413,23 @@ def 
my_sink(file_path, table, my_val=dlt.config.value): # if no value is present, it should raise with pytest.raises(ConfigFieldMissingException): - dlt.pipeline("sink_test", destination=my_sink, full_refresh=True).run( + dlt.pipeline("sink_test", destination=my_sink, dev_mode=True).run( [1, 2, 3], table_name="items" ) # we may give the value via __callable__ function - dlt.pipeline("sink_test", destination=my_sink(my_val="something"), full_refresh=True).run( + dlt.pipeline("sink_test", destination=my_sink(my_val="something"), dev_mode=True).run( [1, 2, 3], table_name="items" ) # right value will pass os.environ["DESTINATION__MY_SINK__MY_VAL"] = "something" - dlt.pipeline("sink_test", destination=my_sink, full_refresh=True).run( - [1, 2, 3], table_name="items" - ) + dlt.pipeline("sink_test", destination=my_sink, dev_mode=True).run([1, 2, 3], table_name="items") # wrong value will raise os.environ["DESTINATION__MY_SINK__MY_VAL"] = "wrong" with pytest.raises(PipelineStepFailed): - dlt.pipeline("sink_test", destination=my_sink, full_refresh=True).run( + dlt.pipeline("sink_test", destination=my_sink, dev_mode=True).run( [1, 2, 3], table_name="items" ) @@ -413,13 +440,13 @@ def other_sink(file_path, table, my_val=dlt.config.value): # if no value is present, it should raise with pytest.raises(ConfigFieldMissingException): - dlt.pipeline("sink_test", destination=other_sink, full_refresh=True).run( + dlt.pipeline("sink_test", destination=other_sink, dev_mode=True).run( [1, 2, 3], table_name="items" ) # right value will pass os.environ["DESTINATION__SOME_NAME__MY_VAL"] = "something" - dlt.pipeline("sink_test", destination=other_sink, full_refresh=True).run( + dlt.pipeline("sink_test", destination=other_sink, dev_mode=True).run( [1, 2, 3], table_name="items" ) @@ -437,7 +464,7 @@ def my_gcp_sink( # missing spec with pytest.raises(ConfigFieldMissingException): - dlt.pipeline("sink_test", destination=my_gcp_sink, full_refresh=True).run( + dlt.pipeline("sink_test", destination=my_gcp_sink, dev_mode=True).run( [1, 2, 3], table_name="items" ) @@ -447,7 +474,7 @@ def my_gcp_sink( os.environ["CREDENTIALS__USERNAME"] = "my_user_name" # now it will run - dlt.pipeline("sink_test", destination=my_gcp_sink, full_refresh=True).run( + dlt.pipeline("sink_test", destination=my_gcp_sink, dev_mode=True).run( [1, 2, 3], table_name="items" ) @@ -471,14 +498,14 @@ def sink_func_with_spec( # call fails because `my_predefined_val` is required part of spec, even if not injected with pytest.raises(ConfigFieldMissingException): - info = dlt.pipeline("sink_test", destination=sink_func_with_spec(), full_refresh=True).run( + info = dlt.pipeline("sink_test", destination=sink_func_with_spec(), dev_mode=True).run( [1, 2, 3], table_name="items" ) info.raise_on_failed_jobs() # call happens now os.environ["MY_PREDEFINED_VAL"] = "VAL" - info = dlt.pipeline("sink_test", destination=sink_func_with_spec(), full_refresh=True).run( + info = dlt.pipeline("sink_test", destination=sink_func_with_spec(), dev_mode=True).run( [1, 2, 3], table_name="items" ) info.raise_on_failed_jobs() @@ -550,7 +577,7 @@ def test_sink(items, table): found_dlt_column_value = True # test with and without removing - p = dlt.pipeline("sink_test", destination=test_sink, full_refresh=True) + p = dlt.pipeline("sink_test", destination=test_sink, dev_mode=True) p.run([{"id": 1, "value": "1"}], table_name="some_table") assert found_dlt_column != remove_stuff @@ -579,7 +606,7 @@ def nesting_sink(items, table): def source(): yield dlt.resource(data, name="data") - p = 
dlt.pipeline("sink_test_max_nesting", destination=nesting_sink, full_refresh=True) + p = dlt.pipeline("sink_test_max_nesting", destination=nesting_sink, dev_mode=True) p.run(source()) # fall back to source setting diff --git a/tests/extract/data_writers/test_buffered_writer.py b/tests/extract/data_writers/test_buffered_writer.py index b6da132de9..5cad5a35b9 100644 --- a/tests/extract/data_writers/test_buffered_writer.py +++ b/tests/extract/data_writers/test_buffered_writer.py @@ -264,6 +264,27 @@ def test_import_file(writer_type: Type[DataWriter]) -> None: assert metrics.file_size == 231 +@pytest.mark.parametrize("writer_type", ALL_WRITERS) +def test_import_file_with_extension(writer_type: Type[DataWriter]) -> None: + now = time.time() + with get_writer(writer_type) as writer: + # won't destroy the original + metrics = writer.import_file( + "tests/extract/cases/imported.any", + DataWriterMetrics("", 1, 231, 0, 0), + with_extension="any", + ) + assert len(writer.closed_files) == 1 + assert os.path.isfile(metrics.file_path) + # extension is correctly set + assert metrics.file_path.endswith(".any") + assert writer.closed_files[0] == metrics + assert metrics.created <= metrics.last_modified + assert metrics.created >= now + assert metrics.items_count == 1 + assert metrics.file_size == 231 + + @pytest.mark.parametrize( "disable_compression", [True, False], ids=["no_compression", "compression"] ) diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index db888c95e4..f9775fd218 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -42,7 +42,7 @@ ) from dlt.extract.items import TableNameMeta -from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V9 +from tests.common.utils import load_yml_case def test_default_resource() -> None: @@ -107,7 +107,10 @@ def test_load_schema_for_callable() -> None: schema = s.schema assert schema.name == "ethereum" == s.name # the schema in the associated file has this hash - assert schema.stored_version_hash == IMPORTED_VERSION_HASH_ETH_V9 + eth_v9 = load_yml_case("schemas/eth/ethereum_schema_v9") + # source removes processing hints so we do + reference_schema = Schema.from_dict(eth_v9, remove_processing_hints=True) + assert schema.stored_version_hash == reference_schema.stored_version_hash def test_unbound_parametrized_transformer() -> None: @@ -341,6 +344,41 @@ class Columns3(BaseModel): assert t["columns"]["b"]["data_type"] == "double" +def test_not_normalized_identifiers_in_hints() -> None: + @dlt.resource( + primary_key="ID", + merge_key=["Month", "Day"], + columns=[{"name": "Col1", "data_type": "bigint"}], + table_name="🐫Camels", + ) + def CamelResource(): + yield ["🐫"] * 10 + + camels = CamelResource() + # original names are kept + assert camels.name == "CamelResource" + assert camels.table_name == "🐫Camels" + assert camels.columns == {"Col1": {"data_type": "bigint", "name": "Col1"}} + table = camels.compute_table_schema() + columns = table["columns"] + assert "ID" in columns + assert "Month" in columns + assert "Day" in columns + assert "Col1" in columns + assert table["name"] == "🐫Camels" + + # define as part of a source + camel_source = DltSource(Schema("snake_case"), "camel_section", [camels]) + schema = camel_source.discover_schema() + # all normalized + table = schema.get_table("_camels") + columns = table["columns"] + assert "id" in columns + assert "month" in columns + assert "day" in columns + assert "col1" in columns + + def test_resource_name_from_generator() -> None: def some_data(): 
yield [1, 2, 3] @@ -565,6 +603,21 @@ def created_global(): _assert_source_schema(created_global(), "global") +def test_source_schema_removes_processing_hints() -> None: + eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9") + assert "x-normalizer" in eth_V9["tables"]["blocks"] + + @dlt.source(schema=Schema.from_dict(eth_V9)) + def created_explicit(): + schema = dlt.current.source_schema() + assert schema.name == "ethereum" + assert "x-normalizer" not in schema.tables["blocks"] + return dlt.resource([1, 2, 3], name="res") + + source = created_explicit() + assert "x-normalizer" not in source.schema.tables["blocks"] + + def test_source_state_context() -> None: @dlt.resource(selected=False) def main(): @@ -849,6 +902,18 @@ def test_standalone_transformer(next_item_mode: str) -> None: ] +def test_transformer_required_args() -> None: + @dlt.transformer + def path_params(id_, workspace_id, load_id, base: bool = False): + yield {"id": id_, "workspace_id": workspace_id, "load_id": load_id} + + data = list([1, 2, 3] | path_params(121, 343)) + assert len(data) == 3 + assert data[0] == {"id": 1, "workspace_id": 121, "load_id": 343} + + # @dlt + + @dlt.transformer(standalone=True, name=lambda args: args["res_name"]) def standalone_tx_with_name(item: TDataItem, res_name: str, init: int = dlt.config.value): return res_name * item * init diff --git a/tests/extract/test_extract.py b/tests/extract/test_extract.py index dc978b997a..dbec417f97 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -125,6 +125,7 @@ def with_table_hints(): {"id": 1, "pk2": "B"}, make_hints( write_disposition="merge", + file_format="preferred", columns=[{"name": "id", "precision": 16}, {"name": "text", "data_type": "decimal"}], primary_key="pk2", ), @@ -143,6 +144,7 @@ def with_table_hints(): assert "pk" in table["columns"] assert "text" in table["columns"] assert table["write_disposition"] == "merge" + assert table["file_format"] == "preferred" # make table name dynamic yield dlt.mark.with_hints( diff --git a/tests/extract/test_extract_pipe.py b/tests/extract/test_extract_pipe.py index 9bf580b76a..d285181c55 100644 --- a/tests/extract/test_extract_pipe.py +++ b/tests/extract/test_extract_pipe.py @@ -510,6 +510,19 @@ def test_pipe_copy_on_fork() -> None: assert elems[0].item is not elems[1].item +def test_pipe_pass_empty_list() -> None: + def _gen(): + yield [] + + pipe = Pipe.from_data("data", _gen()) + elems = list(PipeIterator.from_pipe(pipe)) + assert elems[0].item == [] + + pipe = Pipe.from_data("data", [[]]) + elems = list(PipeIterator.from_pipe(pipe)) + assert elems[0].item == [] + + def test_clone_single_pipe() -> None: doc = {"e": 1, "l": 2} parent = Pipe.from_data("data", [doc]) diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index 7b2613776d..8287da69d4 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -1274,6 +1274,8 @@ def empty_gen(): primary_key=["a", "b"], merge_key=["c", "a"], schema_contract="freeze", + table_format="delta", + file_format="jsonl", ) table = empty_r.compute_table_schema() assert table["columns"]["a"] == { @@ -1288,11 +1290,15 @@ def empty_gen(): assert table["parent"] == "parent" assert empty_r.table_name == "table" assert table["schema_contract"] == "freeze" + assert table["table_format"] == "delta" + assert table["file_format"] == "jsonl" # reset empty_r.apply_hints( table_name="", parent_table_name="", + table_format="", + file_format="", primary_key=[], merge_key="", columns={}, diff --git 
a/tests/libs/pyarrow/test_pyarrow_normalizer.py b/tests/libs/pyarrow/test_pyarrow_normalizer.py index 63abcbc92a..d975702ad8 100644 --- a/tests/libs/pyarrow/test_pyarrow_normalizer.py +++ b/tests/libs/pyarrow/test_pyarrow_normalizer.py @@ -3,8 +3,8 @@ import pyarrow as pa import pytest -from dlt.common.libs.pyarrow import normalize_py_arrow_item, NameNormalizationClash -from dlt.common.normalizers import explicit_normalizers, import_normalizers +from dlt.common.libs.pyarrow import normalize_py_arrow_item, NameNormalizationCollision +from dlt.common.normalizers.utils import explicit_normalizers, import_normalizers from dlt.common.schema.utils import new_column, TColumnSchema from dlt.common.destination import DestinationCapabilitiesContext @@ -65,7 +65,7 @@ def test_field_normalization_clash() -> None: {"col^New": "hello", "col_new": 1}, ] ) - with pytest.raises(NameNormalizationClash): + with pytest.raises(NameNormalizationCollision): _normalize(table, []) diff --git a/tests/load/athena_iceberg/test_athena_adapter.py b/tests/load/athena_iceberg/test_athena_adapter.py index 3144eb9cc9..19c176a374 100644 --- a/tests/load/athena_iceberg/test_athena_adapter.py +++ b/tests/load/athena_iceberg/test_athena_adapter.py @@ -2,7 +2,7 @@ import dlt from dlt.destinations import filesystem -from dlt.destinations.impl.athena.athena_adapter import athena_adapter, athena_partition +from dlt.destinations.adapters import athena_adapter, athena_partition # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -40,7 +40,7 @@ def not_partitioned_table(): "athena_test", destination="athena", staging=filesystem("s3://not-a-real-bucket"), - full_refresh=True, + dev_mode=True, ) pipeline.extract([partitioned_table, not_partitioned_table]) diff --git a/tests/load/athena_iceberg/test_athena_iceberg.py b/tests/load/athena_iceberg/test_athena_iceberg.py index 4fe01752ee..0ef935a8bc 100644 --- a/tests/load/athena_iceberg/test_athena_iceberg.py +++ b/tests/load/athena_iceberg/test_athena_iceberg.py @@ -1,15 +1,9 @@ import pytest import os -import datetime # noqa: I251 from typing import Iterator, Any import dlt -from dlt.common import pendulum -from dlt.common.utils import uniq_id -from tests.cases import table_update_and_row, assert_all_data_types_row -from tests.pipeline.utils import assert_load_info, load_table_counts - -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration +from tests.pipeline.utils import load_table_counts from dlt.destinations.exceptions import DatabaseTerminalException diff --git a/tests/load/bigquery/test_bigquery_client.py b/tests/load/bigquery/test_bigquery_client.py index b16790b07d..e8b5dab8fd 100644 --- a/tests/load/bigquery/test_bigquery_client.py +++ b/tests/load/bigquery/test_bigquery_client.py @@ -22,7 +22,7 @@ from dlt.destinations.impl.bigquery.bigquery import BigQueryClient, BigQueryClientConfiguration from dlt.destinations.exceptions import LoadJobNotExistsException, LoadJobTerminalException -from tests.utils import TEST_STORAGE_ROOT, delete_test_storage, preserve_environ +from tests.utils import TEST_STORAGE_ROOT, delete_test_storage from tests.common.utils import json_case_path as common_json_case_path from tests.common.configuration.utils import environment from tests.load.utils import ( diff --git a/tests/load/bigquery/test_bigquery_streaming_insert.py b/tests/load/bigquery/test_bigquery_streaming_insert.py index c80f6ed65a..c950a46f91 100644 --- a/tests/load/bigquery/test_bigquery_streaming_insert.py +++ 
b/tests/load/bigquery/test_bigquery_streaming_insert.py @@ -1,7 +1,7 @@ import pytest import dlt -from dlt.destinations.impl.bigquery.bigquery_adapter import bigquery_adapter +from dlt.destinations.adapters import bigquery_adapter from tests.pipeline.utils import assert_load_info @@ -12,7 +12,7 @@ def test_resource(): bigquery_adapter(test_resource, insert_api="streaming") - pipe = dlt.pipeline(pipeline_name="insert_test", destination="bigquery", full_refresh=True) + pipe = dlt.pipeline(pipeline_name="insert_test", destination="bigquery", dev_mode=True) pack = pipe.run(test_resource, table_name="test_streaming_items44") assert_load_info(pack) @@ -41,10 +41,12 @@ def test_resource(): pipe = dlt.pipeline(pipeline_name="insert_test", destination="bigquery") info = pipe.run(test_resource) + # pick the failed job + failed_job = info.load_packages[0].jobs["failed_jobs"][0] assert ( """BigQuery streaming insert can only be used with `append`""" """ write_disposition, while the given resource has `merge`.""" - ) in info.asdict()["load_packages"][0]["jobs"][0]["failed_message"] + ) in failed_job.failed_message def test_bigquery_streaming_nested_data(): @@ -54,7 +56,7 @@ def test_resource(): bigquery_adapter(test_resource, insert_api="streaming") - pipe = dlt.pipeline(pipeline_name="insert_test", destination="bigquery", full_refresh=True) + pipe = dlt.pipeline(pipeline_name="insert_test", destination="bigquery", dev_mode=True) pack = pipe.run(test_resource, table_name="test_streaming_items") assert_load_info(pack) diff --git a/tests/load/bigquery/test_bigquery_table_builder.py b/tests/load/bigquery/test_bigquery_table_builder.py index df564192dc..66ea4a319f 100644 --- a/tests/load/bigquery/test_bigquery_table_builder.py +++ b/tests/load/bigquery/test_bigquery_table_builder.py @@ -21,17 +21,23 @@ from dlt.common.schema import Schema from dlt.common.utils import custom_environ from dlt.common.utils import uniq_id + from dlt.destinations.exceptions import DestinationSchemaWillNotUpdate +from dlt.destinations import bigquery from dlt.destinations.impl.bigquery.bigquery import BigQueryClient -from dlt.destinations.impl.bigquery.bigquery_adapter import bigquery_adapter +from dlt.destinations.adapters import bigquery_adapter from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration + from dlt.extract import DltResource -from tests.load.pipeline.utils import ( + +from tests.load.utils import ( destinations_configs, DestinationTestConfiguration, drop_active_pipeline_data, + TABLE_UPDATE, + sequence_generator, + empty_schema, ) -from tests.load.utils import TABLE_UPDATE, sequence_generator, empty_schema # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -58,7 +64,7 @@ def gcp_client(empty_schema: Schema) -> BigQueryClient: creds = GcpServiceAccountCredentials() creds.project_id = "test_project_id" # noinspection PydanticTypeChecker - return BigQueryClient( + return bigquery().client( empty_schema, BigQueryClientConfiguration(credentials=creds)._bind_dataset_name( dataset_name=f"test_{uniq_id()}" @@ -89,9 +95,9 @@ def test_create_table(gcp_client: BigQueryClient) -> None: sqlfluff.parse(sql, dialect="bigquery") assert sql.startswith("CREATE TABLE") assert "event_test_table" in sql - assert "`col1` INTEGER NOT NULL" in sql + assert "`col1` INT64 NOT NULL" in sql assert "`col2` FLOAT64 NOT NULL" in sql - assert "`col3` BOOLEAN NOT NULL" in sql + assert "`col3` BOOL NOT NULL" in sql assert "`col4` TIMESTAMP NOT NULL" in sql assert "`col5` STRING " in sql 
assert "`col6` NUMERIC(38,9) NOT NULL" in sql @@ -100,7 +106,7 @@ def test_create_table(gcp_client: BigQueryClient) -> None: assert "`col9` JSON NOT NULL" in sql assert "`col10` DATE" in sql assert "`col11` TIME" in sql - assert "`col1_precision` INTEGER NOT NULL" in sql + assert "`col1_precision` INT64 NOT NULL" in sql assert "`col4_precision` TIMESTAMP NOT NULL" in sql assert "`col5_precision` STRING(25) " in sql assert "`col6_precision` NUMERIC(6,2) NOT NULL" in sql @@ -119,9 +125,9 @@ def test_alter_table(gcp_client: BigQueryClient) -> None: assert sql.startswith("ALTER TABLE") assert sql.count("ALTER TABLE") == 1 assert "event_test_table" in sql - assert "ADD COLUMN `col1` INTEGER NOT NULL" in sql + assert "ADD COLUMN `col1` INT64 NOT NULL" in sql assert "ADD COLUMN `col2` FLOAT64 NOT NULL" in sql - assert "ADD COLUMN `col3` BOOLEAN NOT NULL" in sql + assert "ADD COLUMN `col3` BOOL NOT NULL" in sql assert "ADD COLUMN `col4` TIMESTAMP NOT NULL" in sql assert "ADD COLUMN `col5` STRING" in sql assert "ADD COLUMN `col6` NUMERIC(38,9) NOT NULL" in sql @@ -130,7 +136,7 @@ def test_alter_table(gcp_client: BigQueryClient) -> None: assert "ADD COLUMN `col9` JSON NOT NULL" in sql assert "ADD COLUMN `col10` DATE" in sql assert "ADD COLUMN `col11` TIME" in sql - assert "ADD COLUMN `col1_precision` INTEGER NOT NULL" in sql + assert "ADD COLUMN `col1_precision` INT64 NOT NULL" in sql assert "ADD COLUMN `col4_precision` TIMESTAMP NOT NULL" in sql assert "ADD COLUMN `col5_precision` STRING(25)" in sql assert "ADD COLUMN `col6_precision` NUMERIC(6,2) NOT NULL" in sql @@ -946,7 +952,7 @@ def sources() -> List[DltResource]: pipeline = destination_config.setup_pipeline( f"bigquery_{uniq_id()}", - full_refresh=True, + dev_mode=True, ) pipeline.run(sources()) diff --git a/tests/load/cases/loading/csv_header.csv b/tests/load/cases/loading/csv_header.csv new file mode 100644 index 0000000000..14c7514e51 --- /dev/null +++ b/tests/load/cases/loading/csv_header.csv @@ -0,0 +1,3 @@ +id|name|description|ordered_at|price +1|item|value|2024-04-12|128.4 +1|"item"|value with space|2024-04-12|128.4 \ No newline at end of file diff --git a/tests/load/cases/loading/csv_no_header.csv b/tests/load/cases/loading/csv_no_header.csv new file mode 100644 index 0000000000..1e3a63494e --- /dev/null +++ b/tests/load/cases/loading/csv_no_header.csv @@ -0,0 +1,2 @@ +1|item|value|2024-04-12|128.4 +1|"item"|value with space|2024-04-12|128.4 \ No newline at end of file diff --git a/tests/load/cases/loading/csv_no_header.csv.gz b/tests/load/cases/loading/csv_no_header.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..310950f4840a3721a2206cabd29d8bface3f8286 GIT binary patch literal 90 zcmb2|=HO`BS(?VcoLpQMpO+t>k(!v2S`=@j2jnvtuQ{pXd3LR)_Zdy!wT4CpMi)&D uE}ELGGBnwA!T6!knm{H`ZO12~r;NRgM7p?zCL2y=VvwyDNcCV~U;qHmn;y#m literal 0 HcmV?d00001 diff --git a/tests/load/cases/loading/header.jsonl b/tests/load/cases/loading/header.jsonl new file mode 100644 index 0000000000..c2f9fee551 --- /dev/null +++ b/tests/load/cases/loading/header.jsonl @@ -0,0 +1,2 @@ +{"id": 1, "name": "item", "description": "value", "ordered_at": "2024-04-12", "price": 128.4} +{"id": 1, "name": "item", "description": "value with space", "ordered_at": "2024-04-12", "price": 128.4} \ No newline at end of file diff --git a/tests/load/clickhouse/test_clickhouse_adapter.py b/tests/load/clickhouse/test_clickhouse_adapter.py index 36d3ac07f7..ea3116c25b 100644 --- a/tests/load/clickhouse/test_clickhouse_adapter.py +++ 
b/tests/load/clickhouse/test_clickhouse_adapter.py @@ -19,7 +19,7 @@ def not_annotated_resource(): clickhouse_adapter(merge_tree_resource, table_engine_type="merge_tree") clickhouse_adapter(replicated_merge_tree_resource, table_engine_type="replicated_merge_tree") - pipe = dlt.pipeline(pipeline_name="adapter_test", destination="clickhouse", full_refresh=True) + pipe = dlt.pipeline(pipeline_name="adapter_test", destination="clickhouse", dev_mode=True) pack = pipe.run([merge_tree_resource, replicated_merge_tree_resource, not_annotated_resource]) assert_load_info(pack) diff --git a/tests/load/clickhouse/test_clickhouse_gcs_s3_compatibility.py b/tests/load/clickhouse/test_clickhouse_gcs_s3_compatibility.py index 481cd420c6..b2edb12d49 100644 --- a/tests/load/clickhouse/test_clickhouse_gcs_s3_compatibility.py +++ b/tests/load/clickhouse/test_clickhouse_gcs_s3_compatibility.py @@ -22,7 +22,7 @@ def dummy_data() -> Generator[Dict[str, int], None, None]: pipeline_name="gcs_s3_compatibility", destination="clickhouse", staging=gcp_bucket, - full_refresh=True, + dev_mode=True, ) pack = pipe.run([dummy_data]) assert_load_info(pack) diff --git a/tests/load/clickhouse/test_clickhouse_table_builder.py b/tests/load/clickhouse/test_clickhouse_table_builder.py index fd3bf50907..867102dde9 100644 --- a/tests/load/clickhouse/test_clickhouse_table_builder.py +++ b/tests/load/clickhouse/test_clickhouse_table_builder.py @@ -6,6 +6,8 @@ from dlt.common.schema import Schema from dlt.common.utils import custom_environ, digest128 from dlt.common.utils import uniq_id + +from dlt.destinations import clickhouse from dlt.destinations.impl.clickhouse.clickhouse import ClickHouseClient from dlt.destinations.impl.clickhouse.configuration import ( ClickHouseCredentials, @@ -18,7 +20,7 @@ def clickhouse_client(empty_schema: Schema) -> ClickHouseClient: # Return a client without opening connection. 
creds = ClickHouseCredentials() - return ClickHouseClient( + return clickhouse().client( empty_schema, ClickHouseClientConfiguration(credentials=creds)._bind_dataset_name(f"test_{uniq_id()}"), ) diff --git a/tests/load/conftest.py b/tests/load/conftest.py index fefaeee077..a110b1198f 100644 --- a/tests/load/conftest.py +++ b/tests/load/conftest.py @@ -2,8 +2,8 @@ import pytest from typing import Iterator -from tests.load.utils import ALL_BUCKETS, DEFAULT_BUCKETS, WITH_GDRIVE_BUCKETS -from tests.utils import preserve_environ +from tests.load.utils import ALL_BUCKETS, DEFAULT_BUCKETS, WITH_GDRIVE_BUCKETS, drop_pipeline +from tests.utils import preserve_environ, patch_home_dir @pytest.fixture(scope="function", params=DEFAULT_BUCKETS) diff --git a/tests/load/databricks/test_databricks_configuration.py b/tests/load/databricks/test_databricks_configuration.py index cc353f5894..f6a06180c9 100644 --- a/tests/load/databricks/test_databricks_configuration.py +++ b/tests/load/databricks/test_databricks_configuration.py @@ -6,7 +6,6 @@ from dlt.destinations.impl.databricks.configuration import DatabricksClientConfiguration from dlt.common.configuration import resolve_configuration -from tests.utils import preserve_environ # mark all tests as essential, do not remove pytestmark = pytest.mark.essential diff --git a/tests/load/dremio/test_dremio_client.py b/tests/load/dremio/test_dremio_client.py index d0002dc343..efc72c0652 100644 --- a/tests/load/dremio/test_dremio_client.py +++ b/tests/load/dremio/test_dremio_client.py @@ -1,6 +1,8 @@ import pytest from dlt.common.schema import TColumnSchema, Schema + +from dlt.destinations import dremio from dlt.destinations.impl.dremio.configuration import DremioClientConfiguration, DremioCredentials from dlt.destinations.impl.dremio.dremio import DremioClient from tests.load.utils import empty_schema @@ -10,11 +12,11 @@ def dremio_client(empty_schema: Schema) -> DremioClient: creds = DremioCredentials() creds.database = "test_database" - return DremioClient( + # ignore any configured values + creds.resolve() + return dremio(credentials=creds).client( empty_schema, - DremioClientConfiguration(credentials=creds)._bind_dataset_name( - dataset_name="test_dataset" - ), + DremioClientConfiguration()._bind_dataset_name(dataset_name="test_dataset"), ) diff --git a/tests/load/duckdb/test_duckdb_client.py b/tests/load/duckdb/test_duckdb_client.py index 8f6bf195e2..ebbe959874 100644 --- a/tests/load/duckdb/test_duckdb_client.py +++ b/tests/load/duckdb/test_duckdb_client.py @@ -15,9 +15,8 @@ from dlt.destinations.impl.duckdb.exceptions import InvalidInMemoryDuckdbCredentials from dlt.pipeline.exceptions import PipelineStepFailed -from tests.load.pipeline.utils import drop_pipeline from tests.pipeline.utils import assert_table -from tests.utils import patch_home_dir, autouse_test_storage, preserve_environ, TEST_STORAGE_ROOT +from tests.utils import patch_home_dir, autouse_test_storage, TEST_STORAGE_ROOT # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -57,7 +56,7 @@ def test_duckdb_open_conn_default() -> None: delete_quack_db() -def test_duckdb_in_memory_mode_via_factory(preserve_environ): +def test_duckdb_in_memory_mode_via_factory(): delete_quack_db() try: import duckdb diff --git a/tests/load/duckdb/test_duckdb_table_builder.py b/tests/load/duckdb/test_duckdb_table_builder.py index 545f182ece..85f86ce84d 100644 --- a/tests/load/duckdb/test_duckdb_table_builder.py +++ b/tests/load/duckdb/test_duckdb_table_builder.py @@ -5,6 +5,7 @@ from 
dlt.common.utils import uniq_id from dlt.common.schema import Schema +from dlt.destinations import duckdb from dlt.destinations.impl.duckdb.duck import DuckDbClient from dlt.destinations.impl.duckdb.configuration import DuckDbClientConfiguration @@ -22,7 +23,7 @@ @pytest.fixture def client(empty_schema: Schema) -> DuckDbClient: # return client without opening connection - return DuckDbClient( + return duckdb().client( empty_schema, DuckDbClientConfiguration()._bind_dataset_name(dataset_name="test_" + uniq_id()), ) @@ -117,7 +118,7 @@ def test_create_table_with_hints(client: DuckDbClient) -> None: assert '"col4" TIMESTAMP WITH TIME ZONE NOT NULL' in sql # same thing with indexes - client = DuckDbClient( + client = duckdb().client( client.schema, DuckDbClientConfiguration(create_indexes=True)._bind_dataset_name( dataset_name="test_" + uniq_id() diff --git a/tests/load/duckdb/test_motherduck_client.py b/tests/load/duckdb/test_motherduck_client.py index 2a1d703c87..764e1654c6 100644 --- a/tests/load/duckdb/test_motherduck_client.py +++ b/tests/load/duckdb/test_motherduck_client.py @@ -14,7 +14,7 @@ MotherDuckClientConfiguration, ) -from tests.utils import patch_home_dir, preserve_environ, skip_if_not_active +from tests.utils import patch_home_dir, skip_if_not_active # mark all tests as essential, do not remove pytestmark = pytest.mark.essential diff --git a/tests/load/filesystem/test_aws_credentials.py b/tests/load/filesystem/test_aws_credentials.py index 1a41144744..5e0a3c3fd0 100644 --- a/tests/load/filesystem/test_aws_credentials.py +++ b/tests/load/filesystem/test_aws_credentials.py @@ -1,6 +1,7 @@ import pytest from typing import Dict +from dlt.common.configuration.specs.base_configuration import CredentialsConfiguration from dlt.common.utils import digest128 from dlt.common.configuration import resolve_configuration from dlt.common.configuration.specs.aws_credentials import AwsCredentials @@ -8,7 +9,7 @@ from tests.common.configuration.utils import environment from tests.load.utils import ALL_FILESYSTEM_DRIVERS -from tests.utils import preserve_environ, autouse_test_storage +from tests.utils import autouse_test_storage # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -101,6 +102,11 @@ def test_aws_credentials_from_boto3(environment: Dict[str, str]) -> None: assert c.aws_access_key_id == "fake_access_key" +def test_aws_credentials_from_unknown_object() -> None: + with pytest.raises(InvalidBoto3Session): + AwsCredentials().parse_native_representation(CredentialsConfiguration()) + + def test_aws_credentials_for_profile(environment: Dict[str, str]) -> None: import botocore.exceptions diff --git a/tests/load/filesystem/test_azure_credentials.py b/tests/load/filesystem/test_azure_credentials.py index 4ee2ec46db..2353491737 100644 --- a/tests/load/filesystem/test_azure_credentials.py +++ b/tests/load/filesystem/test_azure_credentials.py @@ -17,7 +17,7 @@ from dlt.common.storages.configuration import FilesystemConfiguration from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AZ_BUCKET from tests.common.configuration.utils import environment -from tests.utils import preserve_environ, autouse_test_storage +from tests.utils import autouse_test_storage from dlt.common.storages.fsspec_filesystem import fsspec_from_config # mark all tests as essential, do not remove diff --git a/tests/load/filesystem/test_filesystem_client.py b/tests/load/filesystem/test_filesystem_client.py index fbfd08271b..597d400344 100644 --- a/tests/load/filesystem/test_filesystem_client.py +++ 
b/tests/load/filesystem/test_filesystem_client.py @@ -2,13 +2,22 @@ import os from unittest import mock from pathlib import Path +from urllib.parse import urlparse import pytest +from dlt.common.configuration.specs.azure_credentials import AzureCredentials +from dlt.common.configuration.specs.base_configuration import ( + CredentialsConfiguration, + extract_inner_hint, +) +from dlt.common.schema.schema import Schema +from dlt.common.storages.configuration import FilesystemConfiguration from dlt.common.time import ensure_pendulum_datetime from dlt.common.utils import digest128, uniq_id from dlt.common.storages import FileStorage, ParsedLoadJobFileName +from dlt.destinations import filesystem from dlt.destinations.impl.filesystem.filesystem import ( FilesystemDestinationClientConfiguration, INIT_FILE_NAME, @@ -46,6 +55,32 @@ def test_filesystem_destination_configuration() -> None: ).fingerprint() == digest128("s3://cool") +def test_filesystem_factory_buckets(with_gdrive_buckets_env: str) -> None: + proto = urlparse(with_gdrive_buckets_env).scheme + credentials_type = extract_inner_hint( + FilesystemConfiguration.PROTOCOL_CREDENTIALS.get(proto, CredentialsConfiguration) + ) + + # test factory figuring out the right credentials + filesystem_ = filesystem(with_gdrive_buckets_env) + client = filesystem_.client( + Schema("test"), + initial_config=FilesystemDestinationClientConfiguration()._bind_dataset_name("test"), + ) + assert client.config.protocol == proto or "file" + assert isinstance(client.config.credentials, credentials_type) + assert issubclass(client.config.credentials_type(client.config), credentials_type) + assert filesystem_.capabilities() + + # factory gets initial credentials + filesystem_ = filesystem(with_gdrive_buckets_env, credentials=credentials_type()) + client = filesystem_.client( + Schema("test"), + initial_config=FilesystemDestinationClientConfiguration()._bind_dataset_name("test"), + ) + assert isinstance(client.config.credentials, credentials_type) + + @pytest.mark.parametrize("write_disposition", ("replace", "append", "merge")) @pytest.mark.parametrize("layout", TEST_FILE_LAYOUTS) def test_successful_load(write_disposition: str, layout: str, with_gdrive_buckets_env: str) -> None: diff --git a/tests/load/filesystem/test_filesystem_common.py b/tests/load/filesystem/test_filesystem_common.py index 270e1ff70c..a7b1371f9f 100644 --- a/tests/load/filesystem/test_filesystem_common.py +++ b/tests/load/filesystem/test_filesystem_common.py @@ -20,9 +20,10 @@ from dlt.destinations.impl.filesystem.configuration import ( FilesystemDestinationClientConfiguration, ) +from dlt.destinations.impl.filesystem.typing import TExtraPlaceholders from tests.common.storages.utils import assert_sample_files from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AWS_BUCKET -from tests.utils import preserve_environ, autouse_test_storage +from tests.utils import autouse_test_storage from .utils import self_signed_cert from tests.common.configuration.utils import environment @@ -199,7 +200,7 @@ def test_s3_wrong_client_certificate(default_buckets_env: str, self_signed_cert: def test_filesystem_destination_config_reports_unused_placeholders(mocker) -> None: with custom_environ({"DATASET_NAME": "BOBO"}): - extra_placeholders = { + extra_placeholders: TExtraPlaceholders = { "value": 1, "otters": "lab", "dlt": "labs", @@ -211,7 +212,7 @@ def test_filesystem_destination_config_reports_unused_placeholders(mocker) -> No FilesystemDestinationClientConfiguration( bucket_url="file:///tmp/dirbobo", 
layout="{schema_name}/{table_name}/{otters}-x-{x}/{load_id}.{file_id}.{timestamp}.{ext}", - extra_placeholders=extra_placeholders, # type: ignore + extra_placeholders=extra_placeholders, ) ) logger_spy.assert_called_once_with("Found unused layout placeholders: value, dlt, dlthub") @@ -227,7 +228,7 @@ def test_filesystem_destination_passed_parameters_override_config_values() -> No "DESTINATION__FILESYSTEM__EXTRA_PLACEHOLDERS": json.dumps(config_extra_placeholders), } ): - extra_placeholders = { + extra_placeholders: TExtraPlaceholders = { "new_value": 1, "dlt": "labs", "dlthub": "platform", diff --git a/tests/load/filesystem/test_object_store_rs_credentials.py b/tests/load/filesystem/test_object_store_rs_credentials.py index 4e43b7c5d8..524cd4425d 100644 --- a/tests/load/filesystem/test_object_store_rs_credentials.py +++ b/tests/load/filesystem/test_object_store_rs_credentials.py @@ -29,9 +29,11 @@ FS_CREDS: Dict[str, Any] = dlt.secrets.get("destination.filesystem.credentials") -assert ( - FS_CREDS is not None -), "`destination.filesystem.credentials` must be configured for these tests." +if FS_CREDS is None: + pytest.skip( + msg="`destination.filesystem.credentials` must be configured for these tests.", + allow_module_level=True, + ) def can_connect(bucket_url: str, object_store_rs_credentials: Dict[str, str]) -> bool: @@ -86,6 +88,7 @@ def test_aws_object_store_rs_credentials() -> None: creds = AwsCredentials( aws_access_key_id=FS_CREDS["aws_access_key_id"], aws_secret_access_key=FS_CREDS["aws_secret_access_key"], + # region_name must be configured in order for data lake to work region_name=FS_CREDS["region_name"], ) assert creds.aws_session_token is None @@ -138,6 +141,7 @@ def test_gcp_object_store_rs_credentials() -> None: creds = GcpServiceAccountCredentialsWithoutDefaults( project_id=FS_CREDS["project_id"], private_key=FS_CREDS["private_key"], + # private_key_id must be configured in order for data lake to work private_key_id=FS_CREDS["private_key_id"], client_email=FS_CREDS["client_email"], ) diff --git a/tests/load/mssql/test_mssql_credentials.py b/tests/load/mssql/test_mssql_configuration.py similarity index 77% rename from tests/load/mssql/test_mssql_credentials.py rename to tests/load/mssql/test_mssql_configuration.py index 7d49196531..75af101e23 100644 --- a/tests/load/mssql/test_mssql_credentials.py +++ b/tests/load/mssql/test_mssql_configuration.py @@ -1,15 +1,46 @@ +import os import pyodbc import pytest from dlt.common.configuration import resolve_configuration, ConfigFieldMissingException from dlt.common.exceptions import SystemConfigurationException +from dlt.common.schema import Schema -from dlt.destinations.impl.mssql.configuration import MsSqlCredentials +from dlt.destinations import mssql +from dlt.destinations.impl.mssql.configuration import MsSqlCredentials, MsSqlClientConfiguration # mark all tests as essential, do not remove pytestmark = pytest.mark.essential +def test_mssql_factory() -> None: + schema = Schema("schema") + dest = mssql() + client = dest.client(schema, MsSqlClientConfiguration()._bind_dataset_name("dataset")) + assert client.config.create_indexes is False + assert client.config.has_case_sensitive_identifiers is False + assert client.capabilities.has_case_sensitive_identifiers is False + assert client.capabilities.casefold_identifier is str + + # set args explicitly + dest = mssql(has_case_sensitive_identifiers=True, create_indexes=True) + client = dest.client(schema, MsSqlClientConfiguration()._bind_dataset_name("dataset")) + assert 
client.config.create_indexes is True + assert client.config.has_case_sensitive_identifiers is True + assert client.capabilities.has_case_sensitive_identifiers is True + assert client.capabilities.casefold_identifier is str + + # set args via config + os.environ["DESTINATION__CREATE_INDEXES"] = "True" + os.environ["DESTINATION__HAS_CASE_SENSITIVE_IDENTIFIERS"] = "True" + dest = mssql() + client = dest.client(schema, MsSqlClientConfiguration()._bind_dataset_name("dataset")) + assert client.config.create_indexes is True + assert client.config.has_case_sensitive_identifiers is True + assert client.capabilities.has_case_sensitive_identifiers is True + assert client.capabilities.casefold_identifier is str + + def test_mssql_credentials_defaults() -> None: creds = MsSqlCredentials() assert creds.port == 1433 diff --git a/tests/load/mssql/test_mssql_table_builder.py b/tests/load/mssql/test_mssql_table_builder.py index f7a87c14ee..d6cf3ec3e8 100644 --- a/tests/load/mssql/test_mssql_table_builder.py +++ b/tests/load/mssql/test_mssql_table_builder.py @@ -6,7 +6,8 @@ pytest.importorskip("dlt.destinations.impl.mssql.mssql", reason="MSSQL ODBC driver not installed") -from dlt.destinations.impl.mssql.mssql import MsSqlClient +from dlt.destinations import mssql +from dlt.destinations.impl.mssql.mssql import MsSqlJobClient from dlt.destinations.impl.mssql.configuration import MsSqlClientConfiguration, MsSqlCredentials from tests.load.utils import TABLE_UPDATE, empty_schema @@ -16,9 +17,9 @@ @pytest.fixture -def client(empty_schema: Schema) -> MsSqlClient: +def client(empty_schema: Schema) -> MsSqlJobClient: # return client without opening connection - return MsSqlClient( + return mssql().client( empty_schema, MsSqlClientConfiguration(credentials=MsSqlCredentials())._bind_dataset_name( dataset_name="test_" + uniq_id() @@ -26,7 +27,7 @@ def client(empty_schema: Schema) -> MsSqlClient: ) -def test_create_table(client: MsSqlClient) -> None: +def test_create_table(client: MsSqlJobClient) -> None: # non existing table sql = client._get_table_update_sql("event_test_table", TABLE_UPDATE, False)[0] sqlfluff.parse(sql, dialect="tsql") @@ -50,7 +51,7 @@ def test_create_table(client: MsSqlClient) -> None: assert '"col11_precision" time(3) NOT NULL' in sql -def test_alter_table(client: MsSqlClient) -> None: +def test_alter_table(client: MsSqlJobClient) -> None: # existing table has no columns sql = client._get_table_update_sql("event_test_table", TABLE_UPDATE, True)[0] sqlfluff.parse(sql, dialect="tsql") diff --git a/tests/load/pipeline/conftest.py b/tests/load/pipeline/conftest.py index 34227a8041..a2ba65494b 100644 --- a/tests/load/pipeline/conftest.py +++ b/tests/load/pipeline/conftest.py @@ -1,8 +1,2 @@ -from tests.utils import ( - patch_home_dir, - preserve_environ, - autouse_test_storage, - duckdb_pipeline_location, -) +from tests.utils import autouse_test_storage, duckdb_pipeline_location from tests.pipeline.utils import drop_dataset_from_env -from tests.load.pipeline.utils import drop_pipeline diff --git a/tests/load/pipeline/test_arrow_loading.py b/tests/load/pipeline/test_arrow_loading.py index 0bddfaabee..630d84a28c 100644 --- a/tests/load/pipeline/test_arrow_loading.py +++ b/tests/load/pipeline/test_arrow_loading.py @@ -9,14 +9,14 @@ import dlt from dlt.common import pendulum -from dlt.common.time import reduce_pendulum_datetime_precision, ensure_pendulum_datetime +from dlt.common.time import reduce_pendulum_datetime_precision from dlt.common.utils import uniq_id + from tests.load.utils import 
destinations_configs, DestinationTestConfiguration from tests.pipeline.utils import assert_load_info, select_data from tests.utils import ( TestDataItemFormat, arrow_item_from_pandas, - preserve_environ, TPythonTableFormat, ) from tests.cases import arrow_table_all_data_types diff --git a/tests/load/pipeline/test_athena.py b/tests/load/pipeline/test_athena.py index 272cc701d5..3197a19d14 100644 --- a/tests/load/pipeline/test_athena.py +++ b/tests/load/pipeline/test_athena.py @@ -9,15 +9,15 @@ from tests.pipeline.utils import assert_load_info, load_table_counts from tests.pipeline.utils import load_table_counts from dlt.destinations.exceptions import CantExtractTablePrefix -from dlt.destinations.impl.athena.athena_adapter import athena_partition, athena_adapter -from dlt.destinations.fs_client import FSClientBase +from dlt.destinations.adapters import athena_partition, athena_adapter -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration from tests.load.utils import ( TEST_FILE_LAYOUTS, FILE_LAYOUT_MANY_TABLES_ONE_FOLDER, FILE_LAYOUT_CLASSIC, FILE_LAYOUT_TABLE_NOT_FIRST, + destinations_configs, + DestinationTestConfiguration, ) # mark all tests as essential, do not remove @@ -208,7 +208,7 @@ def my_source() -> Any: @pytest.mark.parametrize("layout", TEST_FILE_LAYOUTS) def test_athena_file_layouts(destination_config: DestinationTestConfiguration, layout) -> None: # test wether strange file layouts still work in all staging configs - pipeline = destination_config.setup_pipeline("athena_file_layout", full_refresh=True) + pipeline = destination_config.setup_pipeline("athena_file_layout", dev_mode=True) os.environ["DESTINATION__FILESYSTEM__LAYOUT"] = layout resources = [ @@ -242,7 +242,7 @@ def test_athena_file_layouts(destination_config: DestinationTestConfiguration, l ) def test_athena_partitioned_iceberg_table(destination_config: DestinationTestConfiguration): """Load an iceberg table with partition hints and verifiy partitions are created correctly.""" - pipeline = destination_config.setup_pipeline("athena_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("athena_" + uniq_id(), dev_mode=True) data_items = [ (1, "A", datetime.date.fromisoformat("2021-01-01")), diff --git a/tests/load/pipeline/test_bigquery.py b/tests/load/pipeline/test_bigquery.py index 68533a5d43..0618ff9d3d 100644 --- a/tests/load/pipeline/test_bigquery.py +++ b/tests/load/pipeline/test_bigquery.py @@ -3,8 +3,7 @@ from dlt.common import Decimal from tests.pipeline.utils import assert_load_info -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration -from tests.load.utils import delete_dataset +from tests.load.utils import destinations_configs, DestinationTestConfiguration # mark all tests as essential, do not remove pytestmark = pytest.mark.essential diff --git a/tests/load/pipeline/test_clickhouse.py b/tests/load/pipeline/test_clickhouse.py index 2ba5cfdcb8..8ad3a7f1a7 100644 --- a/tests/load/pipeline/test_clickhouse.py +++ b/tests/load/pipeline/test_clickhouse.py @@ -5,10 +5,7 @@ import dlt from dlt.common.typing import TDataItem from dlt.common.utils import uniq_id -from tests.load.pipeline.utils import ( - destinations_configs, - DestinationTestConfiguration, -) +from tests.load.utils import destinations_configs, DestinationTestConfiguration from tests.pipeline.utils import load_table_counts @@ -18,7 +15,7 @@ ids=lambda x: x.name, ) def test_clickhouse_destination_append(destination_config: DestinationTestConfiguration) 
-> None: - pipeline = destination_config.setup_pipeline(f"clickhouse_{uniq_id()}", full_refresh=True) + pipeline = destination_config.setup_pipeline(f"clickhouse_{uniq_id()}", dev_mode=True) try: diff --git a/tests/load/pipeline/test_csv_loading.py b/tests/load/pipeline/test_csv_loading.py new file mode 100644 index 0000000000..6a2be2eb40 --- /dev/null +++ b/tests/load/pipeline/test_csv_loading.py @@ -0,0 +1,172 @@ +import os +from typing import List +import pytest + +import dlt +from dlt.common.data_writers.configuration import CsvFormatConfiguration +from dlt.common.schema.typing import TColumnSchema +from dlt.common.typing import TLoaderFileFormat +from dlt.common.utils import uniq_id + +from tests.cases import arrow_table_all_data_types, prepare_shuffled_tables +from tests.pipeline.utils import ( + assert_data_table_counts, + assert_load_info, + assert_only_table_columns, + load_tables_to_dicts, +) +from tests.load.utils import destinations_configs, DestinationTestConfiguration +from tests.utils import TestDataItemFormat + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["postgres", "snowflake"]), + ids=lambda x: x.name, +) +@pytest.mark.parametrize("item_type", ["object", "table"]) +def test_load_csv( + destination_config: DestinationTestConfiguration, item_type: TestDataItemFormat +) -> None: + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "True" + pipeline = destination_config.setup_pipeline("postgres_" + uniq_id(), dev_mode=True) + # do not save state so the state job is not created + pipeline.config.restore_from_destination = False + + table, shuffled_table, shuffled_removed_column = prepare_shuffled_tables() + # convert to pylist when loading from objects, this will kick the csv-reader in + if item_type == "object": + table, shuffled_table, shuffled_removed_column = ( + table.to_pylist(), + shuffled_table.to_pylist(), + shuffled_removed_column.to_pylist(), + ) + + load_info = pipeline.run( + [shuffled_removed_column, shuffled_table, table], + table_name="table", + loader_file_format="csv", + ) + assert_load_info(load_info) + job = load_info.load_packages[0].jobs["completed_jobs"][0].file_path + assert job.endswith("csv") + assert_data_table_counts(pipeline, {"table": 5432 * 3}) + load_tables_to_dicts(pipeline, "table") + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["postgres", "snowflake"]), + ids=lambda x: x.name, +) +@pytest.mark.parametrize("file_format", (None, "csv")) +@pytest.mark.parametrize("compression", (True, False)) +def test_custom_csv_no_header( + destination_config: DestinationTestConfiguration, + file_format: TLoaderFileFormat, + compression: bool, +) -> None: + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = str(not compression) + csv_format = CsvFormatConfiguration(delimiter="|", include_header=False) + # apply to collected config + pipeline = destination_config.setup_pipeline("postgres_" + uniq_id(), dev_mode=True) + # this will apply this to config when client instance is created + pipeline.destination.config_params["csv_format"] = csv_format + # verify + assert pipeline.destination_client().config.csv_format == csv_format # type: ignore[attr-defined] + # create a resource that imports file + + columns: List[TColumnSchema] = [ + {"name": "id", "data_type": "bigint"}, + {"name": "name", "data_type": "text"}, + {"name": "description", "data_type": "text"}, + {"name": "ordered_at", "data_type": "date"}, + {"name": "price", "data_type": 
"decimal"}, + ] + hints = dlt.mark.make_hints(columns=columns) + import_file = "tests/load/cases/loading/csv_no_header.csv" + if compression: + import_file += ".gz" + info = pipeline.run( + [dlt.mark.with_file_import(import_file, "csv", 2, hints=hints)], + table_name="no_header", + loader_file_format=file_format, + ) + info.raise_on_failed_jobs() + print(info) + assert_only_table_columns(pipeline, "no_header", [col["name"] for col in columns]) + rows = load_tables_to_dicts(pipeline, "no_header") + assert len(rows["no_header"]) == 2 + # we should have twp files loaded + jobs = info.load_packages[0].jobs["completed_jobs"] + assert len(jobs) == 2 + job_extensions = [os.path.splitext(job.job_file_info.file_name())[1] for job in jobs] + assert ".csv" in job_extensions + # we allow state to be saved to make sure it is not in csv format (which would broke) + # the loading. state is always saved in destination preferred format + preferred_ext = "." + pipeline.destination.capabilities().preferred_loader_file_format + assert preferred_ext in job_extensions + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["postgres", "snowflake"]), + ids=lambda x: x.name, +) +def test_custom_wrong_header(destination_config: DestinationTestConfiguration) -> None: + csv_format = CsvFormatConfiguration(delimiter="|", include_header=True) + # apply to collected config + pipeline = destination_config.setup_pipeline("postgres_" + uniq_id(), dev_mode=True) + # this will apply this to config when client instance is created + pipeline.destination.config_params["csv_format"] = csv_format + # verify + assert pipeline.destination_client().config.csv_format == csv_format # type: ignore[attr-defined] + # create a resource that imports file + + columns: List[TColumnSchema] = [ + {"name": "object_id", "data_type": "bigint", "nullable": False}, + {"name": "name", "data_type": "text"}, + {"name": "description", "data_type": "text"}, + {"name": "ordered_at", "data_type": "date"}, + {"name": "price", "data_type": "decimal"}, + ] + hints = dlt.mark.make_hints(columns=columns) + import_file = "tests/load/cases/loading/csv_header.csv" + # snowflake will pass here because we do not match + info = pipeline.run( + [dlt.mark.with_file_import(import_file, "csv", 2, hints=hints)], + table_name="no_header", + ) + assert info.has_failed_jobs + assert len(info.load_packages[0].jobs["failed_jobs"]) == 1 + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["postgres", "snowflake"]), + ids=lambda x: x.name, +) +def test_empty_csv_from_arrow(destination_config: DestinationTestConfiguration) -> None: + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "True" + os.environ["RESTORE_FROM_DESTINATION"] = "False" + pipeline = destination_config.setup_pipeline("postgres_" + uniq_id(), dev_mode=True) + table, _, _ = arrow_table_all_data_types("arrow-table", include_json=False) + + load_info = pipeline.run( + table.schema.empty_table(), table_name="arrow_table", loader_file_format="csv" + ) + assert_load_info(load_info) + assert len(load_info.load_packages[0].jobs["completed_jobs"]) == 1 + job = load_info.load_packages[0].jobs["completed_jobs"][0].file_path + assert job.endswith("csv") + assert_data_table_counts(pipeline, {"arrow_table": 0}) + with pipeline.sql_client() as client: + with client.execute_query("SELECT * FROM arrow_table") as cur: + columns = [col.name for col in cur.description] + assert len(cur.fetchall()) == 0 + + # all 
columns in order, also casefold to the destination casing (we use cursor.description) + casefold = pipeline.destination.capabilities().casefold_identifier + assert columns == list( + map(casefold, pipeline.default_schema.get_table_columns("arrow_table").keys()) + ) diff --git a/tests/load/pipeline/test_dbt_helper.py b/tests/load/pipeline/test_dbt_helper.py index 1dc225594f..86ee1a646e 100644 --- a/tests/load/pipeline/test_dbt_helper.py +++ b/tests/load/pipeline/test_dbt_helper.py @@ -11,8 +11,8 @@ from dlt.helpers.dbt.exceptions import DBTProcessingError, PrerequisitesException from tests.pipeline.utils import select_data +from tests.load.utils import destinations_configs, DestinationTestConfiguration from tests.utils import ACTIVE_SQL_DESTINATIONS -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration # uncomment add motherduck tests # NOTE: the tests are passing but we disable them due to frequent ATTACH DATABASE timeouts diff --git a/tests/load/pipeline/test_dremio.py b/tests/load/pipeline/test_dremio.py index 9a4c96c922..66d1b0be4f 100644 --- a/tests/load/pipeline/test_dremio.py +++ b/tests/load/pipeline/test_dremio.py @@ -12,9 +12,7 @@ ids=lambda x: x.name, ) def test_dremio(destination_config: DestinationTestConfiguration) -> None: - pipeline = destination_config.setup_pipeline( - "dremio-test", dataset_name="bar", full_refresh=True - ) + pipeline = destination_config.setup_pipeline("dremio-test", dataset_name="bar", dev_mode=True) @dlt.resource(name="items", write_disposition="replace") def items() -> Iterator[Any]: diff --git a/tests/load/pipeline/test_drop.py b/tests/load/pipeline/test_drop.py index 313ba63a2c..e1c6ec9d79 100644 --- a/tests/load/pipeline/test_drop.py +++ b/tests/load/pipeline/test_drop.py @@ -17,11 +17,11 @@ ) from dlt.destinations.job_client_impl import SqlJobClientBase -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration +from tests.load.utils import destinations_configs, DestinationTestConfiguration def _attach(pipeline: Pipeline) -> Pipeline: - return dlt.attach(pipeline.pipeline_name, pipeline.pipelines_dir) + return dlt.attach(pipeline.pipeline_name, pipelines_dir=pipeline.pipelines_dir) @dlt.source(section="droppable", name="droppable") @@ -91,13 +91,14 @@ def assert_dropped_resource_tables(pipeline: Pipeline, resources: List[str]) -> client: SqlJobClientBase with pipeline.destination_client(pipeline.default_schema_name) as client: # type: ignore[assignment] # Check all tables supposed to be dropped are not in dataset - for table in dropped_tables: - exists, _ = client.get_storage_table(table) - assert not exists + storage_tables = list(client.get_storage_tables(dropped_tables)) + # no columns in all tables + assert all(len(table[1]) == 0 for table in storage_tables) + # Check tables not from dropped resources still exist - for table in expected_tables: - exists, _ = client.get_storage_table(table) - assert exists + storage_tables = list(client.get_storage_tables(expected_tables)) + # all tables have columns + assert all(len(table[1]) > 0 for table in storage_tables) def assert_dropped_resource_states(pipeline: Pipeline, resources: List[str]) -> None: @@ -178,7 +179,7 @@ def test_drop_command_only_state(destination_config: DestinationTestConfiguratio def test_drop_command_only_tables(destination_config: DestinationTestConfiguration) -> None: """Test drop only tables and makes sure that schema and state are synced""" source = droppable_source() - pipeline = 
destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), dev_mode=True) pipeline.run(source) sources_state = pipeline.state["sources"] @@ -334,9 +335,8 @@ def test_drop_all_flag(destination_config: DestinationTestConfiguration) -> None # Verify original _dlt tables were not deleted with attached._sql_job_client(attached.default_schema) as client: - for tbl in dlt_tables: - exists, _ = client.get_storage_table(tbl) - assert exists + storage_tables = list(client.get_storage_tables(dlt_tables)) + assert all(len(table[1]) > 0 for table in storage_tables) @pytest.mark.parametrize( diff --git a/tests/load/pipeline/test_duckdb.py b/tests/load/pipeline/test_duckdb.py index 3f9821cee0..3dcfffe348 100644 --- a/tests/load/pipeline/test_duckdb.py +++ b/tests/load/pipeline/test_duckdb.py @@ -1,16 +1,14 @@ import pytest import os +from dlt.common.schema.exceptions import SchemaIdentifierNormalizationCollision from dlt.common.time import ensure_pendulum_datetime from dlt.destinations.exceptions import DatabaseTerminalException from dlt.pipeline.exceptions import PipelineStepFailed from tests.cases import TABLE_UPDATE_ALL_INT_PRECISIONS, TABLE_UPDATE_ALL_TIMESTAMP_PRECISIONS +from tests.load.utils import destinations_configs, DestinationTestConfiguration from tests.pipeline.utils import airtable_emojis, load_table_counts -from tests.load.pipeline.utils import ( - destinations_configs, - DestinationTestConfiguration, -) @pytest.mark.parametrize( @@ -44,7 +42,7 @@ def test_duck_case_names(destination_config: DestinationTestConfiguration) -> No "🦚Peacock__peacock": 3, "🦚Peacocks🦚": 1, "🦚WidePeacock": 1, - "🦚WidePeacock__peacock": 3, + "🦚WidePeacock__Peacock": 3, } # this will fail - duckdb preserves case but is case insensitive when comparing identifiers @@ -54,7 +52,10 @@ def test_duck_case_names(destination_config: DestinationTestConfiguration) -> No table_name="🦚peacocks🦚", loader_file_format=destination_config.file_format, ) - assert isinstance(pip_ex.value.__context__, DatabaseTerminalException) + assert isinstance(pip_ex.value.__context__, SchemaIdentifierNormalizationCollision) + assert pip_ex.value.__context__.conflict_identifier_name == "🦚Peacocks🦚" + assert pip_ex.value.__context__.identifier_name == "🦚peacocks🦚" + assert pip_ex.value.__context__.identifier_type == "table" # show tables and columns with pipeline.sql_client() as client: diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index efbdc082f1..210ad76b8a 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -14,13 +14,14 @@ from dlt.common.utils import uniq_id from dlt.destinations import filesystem from dlt.destinations.impl.filesystem.filesystem import FilesystemClient +from dlt.destinations.impl.filesystem.typing import TExtraPlaceholders from dlt.pipeline.exceptions import PipelineStepFailed from tests.cases import arrow_table_all_data_types, table_update_and_row, assert_all_data_types_row from tests.common.utils import load_json_case from tests.utils import ALL_TEST_DATA_ITEM_FORMATS, TestDataItemFormat, skip_if_not_active from dlt.destinations.path_utils import create_path -from tests.load.pipeline.utils import ( +from tests.load.utils import ( destinations_configs, DestinationTestConfiguration, ) @@ -34,7 +35,7 @@ @pytest.fixture def local_filesystem_pipeline() -> dlt.Pipeline: 
os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "_storage" - return dlt.pipeline(pipeline_name="fs_pipe", destination="filesystem", full_refresh=True) + return dlt.pipeline(pipeline_name="fs_pipe", destination="filesystem", dev_mode=True) def test_pipeline_merge_write_disposition(default_buckets_env: str) -> None: @@ -499,7 +500,7 @@ def count(*args, **kwargs) -> Any: return count - extra_placeholders = { + extra_placeholders: TExtraPlaceholders = { "who": "marcin", "action": "says", "what": "no potato", @@ -653,8 +654,8 @@ def some_data(): # test accessors for state s1 = c1.get_stored_state("p1") s2 = c1.get_stored_state("p2") - assert s1.dlt_load_id == load_id_1_2 # second load - assert s2.dlt_load_id == load_id_2_1 # first load + assert s1._dlt_load_id == load_id_1_2 # second load + assert s2._dlt_load_id == load_id_2_1 # first load assert s1_old.version != s1.version assert s2_old.version == s2.version @@ -797,13 +798,15 @@ def table_3(): # check opening of file values = [] - for line in fs_client.read_text(t1_files[0]).split("\n"): + for line in fs_client.read_text(t1_files[0], encoding="utf-8").split("\n"): if line: values.append(json.loads(line)["value"]) assert values == [1, 2, 3, 4, 5] # check binary read - assert fs_client.read_bytes(t1_files[0]) == str.encode(fs_client.read_text(t1_files[0])) + assert fs_client.read_bytes(t1_files[0]) == str.encode( + fs_client.read_text(t1_files[0], encoding="utf-8") + ) # check truncate fs_client.truncate_tables(["table_1"]) diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py index a3f5083ae6..2c1d1346f1 100644 --- a/tests/load/pipeline/test_merge_disposition.py +++ b/tests/load/pipeline/test_merge_disposition.py @@ -19,7 +19,11 @@ from dlt.pipeline.exceptions import PipelineStepFailed from tests.pipeline.utils import assert_load_info, load_table_counts, select_data -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration +from tests.load.utils import ( + normalize_storage_table_cols, + destinations_configs, + DestinationTestConfiguration, +) # uncomment add motherduck tests # NOTE: the tests are passing but we disable them due to frequent ATTACH DATABASE timeouts @@ -38,7 +42,7 @@ def test_merge_on_keys_in_schema(destination_config: DestinationTestConfiguratio # make block uncles unseen to trigger filtering loader in loader for child tables if has_table_seen_data(schema.tables["blocks__uncles"]): - del schema.tables["blocks__uncles"]["x-normalizer"] # type: ignore[typeddict-item] + del schema.tables["blocks__uncles"]["x-normalizer"] assert not has_table_seen_data(schema.tables["blocks__uncles"]) with open( @@ -307,9 +311,10 @@ def test_merge_keys_non_existing_columns(destination_config: DestinationTestConf github_2_counts = load_table_counts(p, *[t["name"] for t in p.default_schema.data_tables()]) assert github_2_counts["issues"] == 100 - 45 + 1 with p._sql_job_client(p.default_schema) as job_c: - _, table_schema = job_c.get_storage_table("issues") - assert "url" in table_schema - assert "m_a1" not in table_schema # unbound columns were not created + _, storage_cols = job_c.get_storage_table("issues") + storage_cols = normalize_storage_table_cols("issues", storage_cols, p.default_schema) + assert "url" in storage_cols + assert "m_a1" not in storage_cols # unbound columns were not created @pytest.mark.parametrize( @@ -319,6 +324,8 @@ def test_merge_keys_non_existing_columns(destination_config: DestinationTestConf ) def 
test_pipeline_load_parquet(destination_config: DestinationTestConfiguration) -> None: p = destination_config.setup_pipeline("github_3", dev_mode=True) + # do not save state to destination so jobs counting is easier + p.config.restore_from_destination = False github_data = github() # generate some complex types github_data.max_table_nesting = 2 @@ -985,7 +992,7 @@ def test_invalid_merge_strategy(destination_config: DestinationTestConfiguration def r(): yield {"foo": "bar"} - p = destination_config.setup_pipeline("abstract", full_refresh=True) + p = destination_config.setup_pipeline("abstract", dev_mode=True) with pytest.raises(PipelineStepFailed) as pip_ex: p.run(r()) assert isinstance(pip_ex.value.__context__, SchemaException) diff --git a/tests/load/test_parallelism.py b/tests/load/pipeline/test_parallelism.py similarity index 98% rename from tests/load/test_parallelism.py rename to tests/load/pipeline/test_parallelism.py index a1a09a4d6b..656357fb00 100644 --- a/tests/load/test_parallelism.py +++ b/tests/load/pipeline/test_parallelism.py @@ -55,7 +55,7 @@ def t() -> TDataItems: yield {"num": i} # we load n items for 3 tables in one run - p = dlt.pipeline("sink_test", destination=test_sink, full_refresh=True) + p = dlt.pipeline("sink_test", destination=test_sink, dev_mode=True) p.run( [ dlt.resource(table_name="t1")(t), diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index ad44cd6f5c..a12c29168f 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -13,7 +13,8 @@ from dlt.common.destination.reference import WithStagingDataset from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.schema.schema import Schema -from dlt.common.schema.typing import VERSION_TABLE_NAME +from dlt.common.schema.typing import PIPELINE_STATE_TABLE_NAME, VERSION_TABLE_NAME +from dlt.common.schema.utils import pipeline_state_table from dlt.common.typing import TDataItem from dlt.common.utils import uniq_id @@ -26,7 +27,7 @@ PipelineStepFailed, ) -from tests.utils import TEST_STORAGE_ROOT, data_to_item_format, preserve_environ +from tests.utils import TEST_STORAGE_ROOT, data_to_item_format from tests.pipeline.utils import ( assert_data_table_counts, assert_load_info, @@ -40,12 +41,11 @@ TABLE_UPDATE_COLUMNS_SCHEMA, assert_all_data_types_row, delete_dataset, -) -from tests.load.pipeline.utils import ( drop_active_pipeline_data, - REPLACE_STRATEGIES, + destinations_configs, + DestinationTestConfiguration, ) -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration +from tests.load.pipeline.utils import REPLACE_STRATEGIES # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -137,10 +137,27 @@ def data_fun() -> Iterator[Any]: destinations_configs(default_sql_configs=True, all_buckets_filesystem_configs=True), ids=lambda x: x.name, ) -def test_default_schema_name(destination_config: DestinationTestConfiguration) -> None: +@pytest.mark.parametrize("use_single_dataset", [True, False]) +@pytest.mark.parametrize( + "naming_convention", + [ + "duck_case", + "snake_case", + "sql_cs_v1", + ], +) +def test_default_schema_name( + destination_config: DestinationTestConfiguration, + use_single_dataset: bool, + naming_convention: str, +) -> None: + os.environ["SCHEMA__NAMING"] = naming_convention destination_config.setup() dataset_name = "dataset_" + uniq_id() - data = ["a", "b", "c"] + data = [ + {"id": idx, "CamelInfo": uniq_id(), "GEN_ERIC": alpha} + for 
idx, alpha in [(0, "A"), (0, "B"), (0, "C")] + ] p = dlt.pipeline( "test_default_schema_name", @@ -149,16 +166,25 @@ def test_default_schema_name(destination_config: DestinationTestConfiguration) - staging=destination_config.staging, dataset_name=dataset_name, ) + p.config.use_single_dataset = use_single_dataset p.extract(data, table_name="test", schema=Schema("default")) p.normalize() info = p.load() + print(info) # try to restore pipeline r_p = dlt.attach("test_default_schema_name", TEST_STORAGE_ROOT) schema = r_p.default_schema assert schema.name == "default" - assert_table(p, "test", data, info=info) + # check if dlt ables have exactly the required schemas + # TODO: uncomment to check dlt tables schemas + # assert ( + # r_p.default_schema.tables[PIPELINE_STATE_TABLE_NAME]["columns"] + # == pipeline_state_table()["columns"] + # ) + + # assert_table(p, "test", data, info=info) @pytest.mark.parametrize( @@ -947,8 +973,7 @@ def table_3(make_data=False): load_table_counts(pipeline, "table_3") assert "x-normalizer" not in pipeline.default_schema.tables["table_3"] assert ( - pipeline.default_schema.tables["_dlt_pipeline_state"]["x-normalizer"]["seen-data"] # type: ignore[typeddict-item] - is True + pipeline.default_schema.tables["_dlt_pipeline_state"]["x-normalizer"]["seen-data"] is True ) # load with one empty job, table 3 not created @@ -990,18 +1015,9 @@ def table_3(make_data=False): # print(v5) # check if seen data is market correctly - assert ( - pipeline.default_schema.tables["table_3"]["x-normalizer"]["seen-data"] # type: ignore[typeddict-item] - is True - ) - assert ( - pipeline.default_schema.tables["table_2"]["x-normalizer"]["seen-data"] # type: ignore[typeddict-item] - is True - ) - assert ( - pipeline.default_schema.tables["table_1"]["x-normalizer"]["seen-data"] # type: ignore[typeddict-item] - is True - ) + assert pipeline.default_schema.tables["table_3"]["x-normalizer"]["seen-data"] is True + assert pipeline.default_schema.tables["table_2"]["x-normalizer"]["seen-data"] is True + assert pipeline.default_schema.tables["table_1"]["x-normalizer"]["seen-data"] is True job_client, _ = pipeline._get_destination_clients(schema) diff --git a/tests/load/pipeline/test_postgres.py b/tests/load/pipeline/test_postgres.py index a64ee300cd..a4001b7faa 100644 --- a/tests/load/pipeline/test_postgres.py +++ b/tests/load/pipeline/test_postgres.py @@ -6,45 +6,11 @@ from dlt.common.utils import uniq_id -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration -from tests.cases import arrow_table_all_data_types, prepare_shuffled_tables -from tests.pipeline.utils import assert_data_table_counts, assert_load_info, load_tables_to_dicts +from tests.load.utils import destinations_configs, DestinationTestConfiguration +from tests.pipeline.utils import assert_load_info, load_tables_to_dicts from tests.utils import TestDataItemFormat -@pytest.mark.parametrize( - "destination_config", - destinations_configs(default_sql_configs=True, subset=["postgres"]), - ids=lambda x: x.name, -) -@pytest.mark.parametrize("item_type", ["object", "table"]) -def test_postgres_load_csv( - destination_config: DestinationTestConfiguration, item_type: TestDataItemFormat -) -> None: - os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "True" - pipeline = destination_config.setup_pipeline("postgres_" + uniq_id(), full_refresh=True) - table, shuffled_table, shuffled_removed_column = prepare_shuffled_tables() - - # convert to pylist when loading from objects, this will kick the csv-reader in - if item_type 
== "object": - table, shuffled_table, shuffled_removed_column = ( - table.to_pylist(), - shuffled_table.to_pylist(), - shuffled_removed_column.to_pylist(), - ) - - load_info = pipeline.run( - [shuffled_removed_column, shuffled_table, table], - table_name="table", - loader_file_format="csv", - ) - assert_load_info(load_info) - job = load_info.load_packages[0].jobs["completed_jobs"][0].file_path - assert job.endswith("csv") - assert_data_table_counts(pipeline, {"table": 5432 * 3}) - load_tables_to_dicts(pipeline, "table") - - @pytest.mark.parametrize( "destination_config", destinations_configs(default_sql_configs=True, subset=["postgres"]), @@ -64,7 +30,7 @@ def test_postgres_encoded_binary( blob_table = blob_table.to_pylist() print(blob_table) - pipeline = destination_config.setup_pipeline("postgres_" + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("postgres_" + uniq_id(), dev_mode=True) load_info = pipeline.run(blob_table, table_name="table", loader_file_format="csv") assert_load_info(load_info) job = load_info.load_packages[0].jobs["completed_jobs"][0].file_path @@ -76,31 +42,3 @@ def test_postgres_encoded_binary( # print(bytes(data["table"][0]["hash"])) # data in postgres equals unencoded blob assert data["table"][0]["hash"].tobytes() == blob - - -@pytest.mark.parametrize( - "destination_config", - destinations_configs(default_sql_configs=True, subset=["postgres"]), - ids=lambda x: x.name, -) -def test_postgres_empty_csv_from_arrow(destination_config: DestinationTestConfiguration) -> None: - os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "True" - os.environ["RESTORE_FROM_DESTINATION"] = "False" - pipeline = destination_config.setup_pipeline("postgres_" + uniq_id(), full_refresh=True) - table, _, _ = arrow_table_all_data_types("arrow-table", include_json=False) - - load_info = pipeline.run( - table.schema.empty_table(), table_name="table", loader_file_format="csv" - ) - assert_load_info(load_info) - assert len(load_info.load_packages[0].jobs["completed_jobs"]) == 1 - job = load_info.load_packages[0].jobs["completed_jobs"][0].file_path - assert job.endswith("csv") - assert_data_table_counts(pipeline, {"table": 0}) - with pipeline.sql_client() as client: - with client.execute_query('SELECT * FROM "table"') as cur: - columns = [col.name for col in cur.description] - assert len(cur.fetchall()) == 0 - - # all columns in order - assert columns == list(pipeline.default_schema.get_table_columns("table").keys()) diff --git a/tests/load/pipeline/test_redshift.py b/tests/load/pipeline/test_redshift.py index 29293693f5..bfdc15459c 100644 --- a/tests/load/pipeline/test_redshift.py +++ b/tests/load/pipeline/test_redshift.py @@ -4,7 +4,7 @@ import dlt from dlt.common.utils import uniq_id -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration +from tests.load.utils import destinations_configs, DestinationTestConfiguration from tests.cases import table_update_and_row, assert_all_data_types_row from tests.pipeline.utils import assert_load_info diff --git a/tests/load/pipeline/test_refresh_modes.py b/tests/load/pipeline/test_refresh_modes.py index 02ed560068..de557ba118 100644 --- a/tests/load/pipeline/test_refresh_modes.py +++ b/tests/load/pipeline/test_refresh_modes.py @@ -8,7 +8,7 @@ from dlt.common.typing import DictStrAny from dlt.common.pipeline import pipeline_state as current_pipeline_state -from tests.utils import clean_test_storage, preserve_environ +from tests.utils import clean_test_storage from tests.pipeline.utils import ( 
assert_load_info, load_tables_to_dicts, diff --git a/tests/load/pipeline/test_replace_disposition.py b/tests/load/pipeline/test_replace_disposition.py index 464b5aea1f..12bc69abe0 100644 --- a/tests/load/pipeline/test_replace_disposition.py +++ b/tests/load/pipeline/test_replace_disposition.py @@ -4,12 +4,12 @@ from dlt.common.utils import uniq_id from tests.pipeline.utils import assert_load_info, load_table_counts, load_tables_to_dicts -from tests.load.pipeline.utils import ( +from tests.load.utils import ( drop_active_pipeline_data, destinations_configs, DestinationTestConfiguration, - REPLACE_STRATEGIES, ) +from tests.load.pipeline.utils import REPLACE_STRATEGIES @pytest.mark.essential diff --git a/tests/load/pipeline/test_restore_state.py b/tests/load/pipeline/test_restore_state.py index b287619e8c..37f999ff86 100644 --- a/tests/load/pipeline/test_restore_state.py +++ b/tests/load/pipeline/test_restore_state.py @@ -6,7 +6,9 @@ import dlt from dlt.common import pendulum -from dlt.common.schema.schema import Schema +from dlt.common.destination.capabilities import DestinationCapabilitiesContext +from dlt.common.schema.schema import Schema, utils +from dlt.common.schema.utils import normalize_table_identifiers from dlt.common.utils import uniq_id from dlt.common.destination.exceptions import DestinationUndefinedEntity @@ -14,7 +16,6 @@ from dlt.pipeline.exceptions import SqlClientNotAvailable from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.state_sync import ( - STATE_TABLE_COLUMNS, load_pipeline_state_from_destination, state_resource, ) @@ -24,12 +25,12 @@ from tests.cases import JSON_TYPED_DICT, JSON_TYPED_DICT_DECODED from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V9, yml_case_path as common_yml_case_path from tests.common.configuration.utils import environment -from tests.load.pipeline.utils import drop_active_pipeline_data from tests.pipeline.utils import assert_query_data from tests.load.utils import ( destinations_configs, DestinationTestConfiguration, get_normalized_dataset_name, + drop_active_pipeline_data, ) @@ -77,15 +78,17 @@ def test_restore_state_utils(destination_config: DestinationTestConfiguration) - initial_state["_local"]["_last_extracted_at"] = pendulum.now() initial_state["_local"]["_last_extracted_hash"] = initial_state["_version_hash"] # add _dlt_id and _dlt_load_id - resource, _ = state_resource(initial_state) + resource, _ = state_resource(initial_state, "not_used_load_id") resource.apply_hints( columns={ "_dlt_id": {"name": "_dlt_id", "data_type": "text", "nullable": False}, "_dlt_load_id": {"name": "_dlt_load_id", "data_type": "text", "nullable": False}, - **STATE_TABLE_COLUMNS, + **utils.pipeline_state_table()["columns"], } ) - schema.update_table(schema.normalize_table_identifiers(resource.compute_table_schema())) + schema.update_table( + normalize_table_identifiers(resource.compute_table_schema(), schema.naming) + ) # do not bump version here or in sync_schema, dlt won't recognize that schema changed and it won't update it in storage # so dlt in normalize stage infers _state_version table again but with different column order and the column order in schema is different # then in database. parquet is created in schema order and in Redshift it must exactly match the order. 
@@ -183,6 +186,7 @@ def test_silently_skip_on_invalid_credentials( destination_config.setup_pipeline(pipeline_name=pipeline_name, dataset_name=dataset_name) +@pytest.mark.essential @pytest.mark.parametrize( "destination_config", destinations_configs( @@ -191,13 +195,25 @@ def test_silently_skip_on_invalid_credentials( ids=lambda x: x.name, ) @pytest.mark.parametrize("use_single_dataset", [True, False]) +@pytest.mark.parametrize( + "naming_convention", + [ + "tests.common.cases.normalizers.title_case", + "snake_case", + ], +) def test_get_schemas_from_destination( - destination_config: DestinationTestConfiguration, use_single_dataset: bool + destination_config: DestinationTestConfiguration, + use_single_dataset: bool, + naming_convention: str, ) -> None: + set_naming_env(destination_config.destination, naming_convention) + pipeline_name = "pipe_" + uniq_id() dataset_name = "state_test_" + uniq_id() p = destination_config.setup_pipeline(pipeline_name=pipeline_name, dataset_name=dataset_name) + assert_naming_to_caps(destination_config.destination, p.destination.capabilities()) p.config.use_single_dataset = use_single_dataset def _make_dn_name(schema_name: str) -> str: @@ -268,18 +284,34 @@ def _make_dn_name(schema_name: str) -> str: assert len(restored_schemas) == 3 +@pytest.mark.essential @pytest.mark.parametrize( "destination_config", destinations_configs( - default_sql_configs=True, default_vector_configs=True, all_buckets_filesystem_configs=True + default_sql_configs=True, + all_staging_configs=True, + default_vector_configs=True, + all_buckets_filesystem_configs=True, ), ids=lambda x: x.name, ) -def test_restore_state_pipeline(destination_config: DestinationTestConfiguration) -> None: +@pytest.mark.parametrize( + "naming_convention", + [ + "tests.common.cases.normalizers.title_case", + "snake_case", + ], +) +def test_restore_state_pipeline( + destination_config: DestinationTestConfiguration, naming_convention: str +) -> None: + set_naming_env(destination_config.destination, naming_convention) + # enable restoring from destination os.environ["RESTORE_FROM_DESTINATION"] = "True" pipeline_name = "pipe_" + uniq_id() dataset_name = "state_test_" + uniq_id() p = destination_config.setup_pipeline(pipeline_name=pipeline_name, dataset_name=dataset_name) + assert_naming_to_caps(destination_config.destination, p.destination.capabilities()) def some_data_gen(param: str) -> Any: dlt.current.source_state()[param] = param @@ -366,7 +398,7 @@ def some_data(): p = destination_config.setup_pipeline(pipeline_name=pipeline_name, dataset_name=dataset_name) # now attach locally os.environ["RESTORE_FROM_DESTINATION"] = "True" - p = dlt.attach(pipeline_name=pipeline_name) + p = destination_config.attach_pipeline(pipeline_name=pipeline_name) assert p.dataset_name == dataset_name assert p.default_schema_name is None # restore @@ -451,6 +483,9 @@ def test_restore_schemas_while_import_schemas_exist( # make sure schema got imported schema = p.schemas["ethereum"] assert "blocks" in schema.tables + # allow to modify tables even if naming convention is changed. some of the tables in ethereum schema + # have processing hints that lock the table schema. 
so when weaviate changes naming convention we have an exception + os.environ["SCHEMA__ALLOW_IDENTIFIER_CHANGE_ON_TABLE_WITH_DATA"] = "true" # extract some additional data to upgrade schema in the pipeline p.run( @@ -467,7 +502,7 @@ def test_restore_schemas_while_import_schemas_exist( assert normalized_labels in schema.tables # re-attach the pipeline - p = dlt.attach(pipeline_name=pipeline_name) + p = destination_config.attach_pipeline(pipeline_name=pipeline_name) p.run( ["C", "D", "E"], table_name="annotations", loader_file_format=destination_config.file_format ) @@ -496,7 +531,7 @@ def test_restore_schemas_while_import_schemas_exist( assert normalized_annotations in schema.tables # check if attached to import schema - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V9 + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V9() # extract some data with restored pipeline p.run( ["C", "D", "E"], table_name="blacklist", loader_file_format=destination_config.file_format @@ -604,7 +639,9 @@ def some_data(param: str) -> Any: prod_state = production_p.state assert p.state["_state_version"] == prod_state["_state_version"] - 1 # re-attach production and sync - ra_production_p = dlt.attach(pipeline_name=pipeline_name, pipelines_dir=TEST_STORAGE_ROOT) + ra_production_p = destination_config.attach_pipeline( + pipeline_name=pipeline_name, pipelines_dir=TEST_STORAGE_ROOT + ) ra_production_p.sync_destination() # state didn't change because production is ahead of local with its version # nevertheless this is potentially dangerous situation 🤷 @@ -613,10 +650,18 @@ def some_data(param: str) -> Any: # get all the states, notice version 4 twice (one from production, the other from local) try: with p.sql_client() as client: + # use sql_client to escape identifiers properly state_table = client.make_qualified_table_name(p.default_schema.state_table_name) - + c_version = client.escape_column_name( + p.default_schema.naming.normalize_identifier("version") + ) + c_created_at = client.escape_column_name( + p.default_schema.naming.normalize_identifier("created_at") + ) assert_query_data( - p, f"SELECT version FROM {state_table} ORDER BY created_at DESC", [5, 4, 4, 3, 2] + p, + f"SELECT {c_version} FROM {state_table} ORDER BY {c_created_at} DESC", + [5, 4, 4, 3, 2], ) except SqlClientNotAvailable: pytest.skip(f"destination {destination_config.destination} does not support sql client") @@ -669,7 +714,7 @@ def some_data(param: str) -> Any: assert p.dataset_name == dataset_name print("---> no state sync last attach") - p = dlt.attach(pipeline_name=pipeline_name) + p = destination_config.attach_pipeline(pipeline_name=pipeline_name) # this will prevent from creating of _dlt_pipeline_state p.config.restore_from_destination = False data4 = some_data("state4") @@ -686,7 +731,7 @@ def some_data(param: str) -> Any: assert p.state["_local"]["first_run"] is False # attach again to make the `run` method check the destination print("---> last attach") - p = dlt.attach(pipeline_name=pipeline_name) + p = destination_config.attach_pipeline(pipeline_name=pipeline_name) p.config.restore_from_destination = True data5 = some_data("state4") data5.apply_hints(table_name="state1_data5") @@ -696,8 +741,31 @@ def some_data(param: str) -> Any: def prepare_import_folder(p: Pipeline) -> None: - os.makedirs(p._schema_storage.config.import_schema_path, exist_ok=True) - shutil.copy( - common_yml_case_path("schemas/eth/ethereum_schema_v5"), - os.path.join(p._schema_storage.config.import_schema_path, 
"ethereum.schema.yaml"), - ) + from tests.common.storages.utils import prepare_eth_import_folder + + prepare_eth_import_folder(p._schema_storage) + + +def set_naming_env(destination: str, naming_convention: str) -> None: + # snake case is for default convention so do not set it + if naming_convention != "snake_case": + # path convention to test weaviate ci_naming + if destination == "weaviate": + if naming_convention.endswith("sql_upper"): + pytest.skip(f"{naming_convention} not supported on weaviate") + else: + naming_convention = "dlt.destinations.impl.weaviate.ci_naming" + os.environ["SCHEMA__NAMING"] = naming_convention + + +def assert_naming_to_caps(destination: str, caps: DestinationCapabilitiesContext) -> None: + naming = Schema("test").naming + if ( + not caps.has_case_sensitive_identifiers + and caps.casefold_identifier is not str + and naming.is_case_sensitive + ): + pytest.skip( + f"Skipping for case insensitive destination {destination} with case folding because" + f" naming {naming.name()} is case sensitive" + ) diff --git a/tests/load/pipeline/test_scd2.py b/tests/load/pipeline/test_scd2.py index e8baa33ff3..b33c5a2590 100644 --- a/tests/load/pipeline/test_scd2.py +++ b/tests/load/pipeline/test_scd2.py @@ -17,12 +17,11 @@ from dlt.pipeline.exceptions import PipelineStepFailed from tests.cases import arrow_table_all_data_types -from tests.pipeline.utils import assert_load_info, load_table_counts -from tests.load.pipeline.utils import ( +from tests.load.utils import ( destinations_configs, DestinationTestConfiguration, ) -from tests.pipeline.utils import load_tables_to_dicts +from tests.pipeline.utils import load_tables_to_dicts, assert_load_info, load_table_counts from tests.utils import TPythonTableFormat @@ -104,7 +103,7 @@ def test_core_functionality( validity_column_names: List[str], active_record_timestamp: Optional[pendulum.DateTime], ) -> None: - p = destination_config.setup_pipeline("abstract", full_refresh=True) + p = destination_config.setup_pipeline("abstract", dev_mode=True) @dlt.resource( table_name="dim_test", @@ -243,7 +242,7 @@ def r(data): ) @pytest.mark.parametrize("simple", [True, False]) def test_child_table(destination_config: DestinationTestConfiguration, simple: bool) -> None: - p = destination_config.setup_pipeline("abstract", full_refresh=True) + p = destination_config.setup_pipeline("abstract", dev_mode=True) @dlt.resource( table_name="dim_test", write_disposition={"disposition": "merge", "strategy": "scd2"} @@ -386,7 +385,7 @@ def r(data): ids=lambda x: x.name, ) def test_grandchild_table(destination_config: DestinationTestConfiguration) -> None: - p = destination_config.setup_pipeline("abstract", full_refresh=True) + p = destination_config.setup_pipeline("abstract", dev_mode=True) @dlt.resource( table_name="dim_test", write_disposition={"disposition": "merge", "strategy": "scd2"} @@ -479,7 +478,7 @@ def r(data): ids=lambda x: x.name, ) def test_validity_column_name_conflict(destination_config: DestinationTestConfiguration) -> None: - p = destination_config.setup_pipeline("abstract", full_refresh=True) + p = destination_config.setup_pipeline("abstract", dev_mode=True) @dlt.resource( table_name="dim_test", @@ -525,7 +524,7 @@ def test_active_record_timestamp( destination_config: DestinationTestConfiguration, active_record_timestamp: Optional[TAnyDateTime], ) -> None: - p = destination_config.setup_pipeline("abstract", full_refresh=True) + p = destination_config.setup_pipeline("abstract", dev_mode=True) @dlt.resource( table_name="dim_test", @@ -572,7 
+571,7 @@ def _make_scd2_r(table_: Any) -> DltResource: }, ).add_map(add_row_hash_to_table("row_hash")) - p = destination_config.setup_pipeline("abstract", full_refresh=True) + p = destination_config.setup_pipeline("abstract", dev_mode=True) info = p.run(_make_scd2_r(table), loader_file_format=destination_config.file_format) assert_load_info(info) # make sure we have scd2 columns in schema @@ -608,7 +607,7 @@ def _make_scd2_r(table_: Any) -> DltResource: ids=lambda x: x.name, ) def test_user_provided_row_hash(destination_config: DestinationTestConfiguration) -> None: - p = destination_config.setup_pipeline("abstract", full_refresh=True) + p = destination_config.setup_pipeline("abstract", dev_mode=True) @dlt.resource( table_name="dim_test", diff --git a/tests/load/pipeline/test_snowflake_pipeline.py b/tests/load/pipeline/test_snowflake_pipeline.py new file mode 100644 index 0000000000..3cfa9e8b21 --- /dev/null +++ b/tests/load/pipeline/test_snowflake_pipeline.py @@ -0,0 +1,55 @@ +import pytest + +import dlt +from dlt.common import Decimal + +from dlt.common.utils import uniq_id +from dlt.destinations.exceptions import DatabaseUndefinedRelation +from tests.pipeline.utils import assert_load_info +from tests.load.utils import destinations_configs, DestinationTestConfiguration + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["snowflake"]), + ids=lambda x: x.name, +) +def test_snowflake_case_sensitive_identifiers( + destination_config: DestinationTestConfiguration, +) -> None: + snow_ = dlt.destinations.snowflake(naming_convention="sql_cs_v1") + + dataset_name = "CaseSensitive_Dataset_" + uniq_id() + pipeline = destination_config.setup_pipeline( + "test_snowflake_case_sensitive_identifiers", dataset_name=dataset_name, destination=snow_ + ) + caps = pipeline.destination.capabilities() + assert caps.naming_convention == "sql_cs_v1" + + destination_client = pipeline.destination_client() + # assert snowflake caps to be in case sensitive mode + assert destination_client.capabilities.casefold_identifier is str + + # load some case sensitive data + info = pipeline.run([{"Id": 1, "Capital": 0.0}], table_name="Expenses") + assert_load_info(info) + with pipeline.sql_client() as client: + assert client.has_dataset() + # use the same case sensitive dataset + with client.with_alternative_dataset_name(dataset_name): + assert client.has_dataset() + # make it case insensitive (upper) + with client.with_alternative_dataset_name(dataset_name.upper()): + assert not client.has_dataset() + # keep case sensitive but make lowercase + with client.with_alternative_dataset_name(dataset_name.lower()): + assert not client.has_dataset() + + # must use quoted identifiers + rows = client.execute_sql('SELECT "Id", "Capital" FROM "Expenses"') + print(rows) + with pytest.raises(DatabaseUndefinedRelation): + client.execute_sql('SELECT "Id", "Capital" FROM Expenses') diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index e0e2154b57..7f1427f20f 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -8,15 +8,13 @@ from dlt.common.schema.typing import TDataType from tests.load.pipeline.test_merge_disposition import github -from tests.pipeline.utils import load_table_counts -from tests.pipeline.utils import assert_load_info +from tests.pipeline.utils import load_table_counts, 
assert_load_info from tests.load.utils import ( - TABLE_ROW_ALL_DATA_TYPES, - TABLE_UPDATE_COLUMNS_SCHEMA, + destinations_configs, + DestinationTestConfiguration, assert_all_data_types_row, ) from tests.cases import table_update_and_row -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration @dlt.resource( @@ -65,12 +63,17 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: ) == 4 ) + # pipeline state is loaded with preferred format, so allows (possibly) for two job formats + caps = pipeline.destination.capabilities() + # NOTE: preferred_staging_file_format goes first because here we test staged loading and + # default caps will be modified so preferred_staging_file_format is used as main + preferred_format = caps.preferred_staging_file_format or caps.preferred_loader_file_format assert ( len( [ x for x in package_info.jobs["completed_jobs"] - if x.job_file_info.file_format == destination_config.file_format + if x.job_file_info.file_format in (destination_config.file_format, preferred_format) ] ) == 4 diff --git a/tests/load/pipeline/test_write_disposition_changes.py b/tests/load/pipeline/test_write_disposition_changes.py index 16c589352e..ba2f6bf172 100644 --- a/tests/load/pipeline/test_write_disposition_changes.py +++ b/tests/load/pipeline/test_write_disposition_changes.py @@ -1,7 +1,7 @@ import pytest import dlt from typing import Any -from tests.load.pipeline.utils import ( +from tests.load.utils import ( destinations_configs, DestinationTestConfiguration, ) @@ -124,9 +124,13 @@ def source(): ) # schemaless destinations allow adding of root key without the pipeline failing - # for now this is only the case for dremio + # they do not mind adding NOT NULL columns to tables with existing data (id NOT NULL is supported at all) # doing this will result in somewhat useless behavior - destination_allows_adding_root_key = destination_config.destination in ["dremio", "clickhouse"] + destination_allows_adding_root_key = destination_config.destination in [ + "dremio", + "clickhouse", + "athena", + ] if destination_allows_adding_root_key and not with_root_key: pipeline.run( diff --git a/tests/load/pipeline/utils.py b/tests/load/pipeline/utils.py index d762029ddd..679c2d6da9 100644 --- a/tests/load/pipeline/utils.py +++ b/tests/load/pipeline/utils.py @@ -1,67 +1 @@ -from typing import Any, Iterator, List, Sequence, TYPE_CHECKING, Callable -import pytest - -import dlt -from dlt.common.destination.reference import WithStagingDataset - -from dlt.common.configuration.container import Container -from dlt.common.pipeline import LoadInfo, PipelineContext - -from tests.load.utils import DestinationTestConfiguration, destinations_configs -from dlt.destinations.exceptions import CantExtractTablePrefix - -if TYPE_CHECKING: - from dlt.destinations.impl.filesystem.filesystem import FilesystemClient - REPLACE_STRATEGIES = ["truncate-and-insert", "insert-from-staging", "staging-optimized"] - - -@pytest.fixture(autouse=True) -def drop_pipeline(request) -> Iterator[None]: - yield - if "no_load" in request.keywords: - return - try: - drop_active_pipeline_data() - except CantExtractTablePrefix: - # for some tests we test that this exception is raised, - # so we suppress it here - pass - - -def drop_active_pipeline_data() -> None: - """Drops all the datasets for currently active pipeline, wipes the working folder and then deactivated it.""" - if Container()[PipelineContext].is_active(): - # take existing pipeline - p = dlt.pipeline() - - def 
_drop_dataset(schema_name: str) -> None: - with p.destination_client(schema_name) as client: - try: - client.drop_storage() - print("dropped") - except Exception as exc: - print(exc) - if isinstance(client, WithStagingDataset): - with client.with_staging_dataset(): - try: - client.drop_storage() - print("staging dropped") - except Exception as exc: - print(exc) - - # drop_func = _drop_dataset_fs if _is_filesystem(p) else _drop_dataset_sql - # take all schemas and if destination was set - if p.destination: - if p.config.use_single_dataset: - # drop just the dataset for default schema - if p.default_schema_name: - _drop_dataset(p.default_schema_name) - else: - # for each schema, drop the dataset - for schema_name in p.schema_names: - _drop_dataset(schema_name) - - # p._wipe_working_folder() - # deactivate context - Container()[PipelineContext].deactivate() diff --git a/tests/load/postgres/test_postgres_client.py b/tests/load/postgres/test_postgres_client.py index a0fbd85b5b..d8cd996dcf 100644 --- a/tests/load/postgres/test_postgres_client.py +++ b/tests/load/postgres/test_postgres_client.py @@ -11,7 +11,7 @@ from dlt.destinations.impl.postgres.postgres import PostgresClient from dlt.destinations.impl.postgres.sql_client import psycopg2 -from tests.utils import TEST_STORAGE_ROOT, delete_test_storage, skipifpypy, preserve_environ +from tests.utils import TEST_STORAGE_ROOT, delete_test_storage, skipifpypy from tests.load.utils import expect_load_file, prepare_table, yield_client_with_storage from tests.common.configuration.utils import environment diff --git a/tests/load/postgres/test_postgres_table_builder.py b/tests/load/postgres/test_postgres_table_builder.py index 7566b8afce..5ba68be67c 100644 --- a/tests/load/postgres/test_postgres_table_builder.py +++ b/tests/load/postgres/test_postgres_table_builder.py @@ -4,8 +4,9 @@ from dlt.common.exceptions import TerminalValueError from dlt.common.utils import uniq_id -from dlt.common.schema import Schema +from dlt.common.schema import Schema, utils +from dlt.destinations import postgres from dlt.destinations.impl.postgres.postgres import PostgresClient from dlt.destinations.impl.postgres.configuration import ( PostgresClientConfiguration, @@ -25,13 +26,23 @@ @pytest.fixture def client(empty_schema: Schema) -> PostgresClient: + return create_client(empty_schema) + + +@pytest.fixture +def cs_client(empty_schema: Schema) -> PostgresClient: + # change normalizer to case sensitive + empty_schema._normalizers_config["names"] = "tests.common.cases.normalizers.title_case" + empty_schema.update_normalizers() + return create_client(empty_schema) + + +def create_client(empty_schema: Schema) -> PostgresClient: # return client without opening connection - return PostgresClient( - empty_schema, - PostgresClientConfiguration(credentials=PostgresCredentials())._bind_dataset_name( - dataset_name="test_" + uniq_id() - ), + config = PostgresClientConfiguration(credentials=PostgresCredentials())._bind_dataset_name( + dataset_name="test_" + uniq_id() ) + return postgres().client(empty_schema, config) def test_create_table(client: PostgresClient) -> None: @@ -102,7 +113,7 @@ def test_alter_table(client: PostgresClient) -> None: assert '"col11_precision" time (3) without time zone NOT NULL' in sql -def test_create_table_with_hints(client: PostgresClient) -> None: +def test_create_table_with_hints(client: PostgresClient, empty_schema: Schema) -> None: mod_update = deepcopy(TABLE_UPDATE) # timestamp mod_update[0]["primary_key"] = True @@ -119,8 +130,8 @@ def 
test_create_table_with_hints(client: PostgresClient) -> None: assert '"col4" timestamp with time zone NOT NULL' in sql # same thing without indexes - client = PostgresClient( - client.schema, + client = postgres().client( + empty_schema, PostgresClientConfiguration( create_indexes=False, credentials=PostgresCredentials(), @@ -129,3 +140,20 @@ def test_create_table_with_hints(client: PostgresClient) -> None: sql = client._get_table_update_sql("event_test_table", mod_update, False)[0] sqlfluff.parse(sql, dialect="postgres") assert '"col2" double precision NOT NULL' in sql + + +def test_create_table_case_sensitive(cs_client: PostgresClient) -> None: + cs_client.schema.update_table( + utils.new_table("event_test_table", columns=deepcopy(TABLE_UPDATE)) + ) + sql = cs_client._get_table_update_sql( + "Event_test_tablE", + list(cs_client.schema.get_table_columns("Event_test_tablE").values()), + False, + )[0] + sqlfluff.parse(sql, dialect="postgres") + # everything capitalized + assert cs_client.sql_client.fully_qualified_dataset_name(escape=False)[0] == "T" # Test + # every line starts with "Col" + for line in sql.split("\n")[1:]: + assert line.startswith('"Col') diff --git a/tests/load/qdrant/test_pipeline.py b/tests/load/qdrant/test_pipeline.py index d50b50282a..e0bd1fff97 100644 --- a/tests/load/qdrant/test_pipeline.py +++ b/tests/load/qdrant/test_pipeline.py @@ -5,6 +5,7 @@ from dlt.common import json from dlt.common.utils import uniq_id +from dlt.destinations.adapters import qdrant_adapter from dlt.destinations.impl.qdrant.qdrant_adapter import qdrant_adapter, VECTORIZE_HINT from dlt.destinations.impl.qdrant.qdrant_client import QdrantClient from tests.pipeline.utils import assert_load_info @@ -68,6 +69,8 @@ def some_data(): assert schema state = client.get_stored_state("test_pipeline_append") assert state + state = client.get_stored_state("unknown_pipeline") + assert state is None def test_pipeline_append() -> None: diff --git a/tests/load/qdrant/utils.py b/tests/load/qdrant/utils.py index 74d5db9715..3b12d15f86 100644 --- a/tests/load/qdrant/utils.py +++ b/tests/load/qdrant/utils.py @@ -20,16 +20,16 @@ def assert_collection( expected_items_count: int = None, items: List[Any] = None, ) -> None: - client: QdrantClient = pipeline.destination_client() # type: ignore[assignment] + client: QdrantClient + with pipeline.destination_client() as client: # type: ignore[assignment] + # Check if collection exists + exists = client._collection_exists(collection_name) + assert exists - # Check if collection exists - exists = client._collection_exists(collection_name) - assert exists - - qualified_collection_name = client._make_qualified_collection_name(collection_name) - point_records, offset = client.db_client.scroll( - qualified_collection_name, with_payload=True, limit=50 - ) + qualified_collection_name = client._make_qualified_collection_name(collection_name) + point_records, offset = client.db_client.scroll( + qualified_collection_name, with_payload=True, limit=50 + ) if expected_items_count is not None: assert expected_items_count == len(point_records) @@ -55,10 +55,11 @@ def has_collections(client): if Container()[PipelineContext].is_active(): # take existing pipeline p = dlt.pipeline() - client: QdrantClient = p.destination_client() # type: ignore[assignment] + client: QdrantClient - if has_collections(client): - client.drop_storage() + with p.destination_client() as client: # type: ignore[assignment] + if has_collections(client): + client.drop_storage() p._wipe_working_folder() # deactivate 
context diff --git a/tests/load/redshift/test_redshift_client.py b/tests/load/redshift/test_redshift_client.py index 03bb57c3b4..bb923df673 100644 --- a/tests/load/redshift/test_redshift_client.py +++ b/tests/load/redshift/test_redshift_client.py @@ -6,13 +6,18 @@ from dlt.common import json, pendulum from dlt.common.configuration.resolve import resolve_configuration +from dlt.common.schema.schema import Schema from dlt.common.schema.typing import VERSION_TABLE_NAME from dlt.common.storages import FileStorage from dlt.common.storages.schema_storage import SchemaStorage from dlt.common.utils import uniq_id from dlt.destinations.exceptions import DatabaseTerminalException -from dlt.destinations.impl.redshift.configuration import RedshiftCredentials +from dlt.destinations import redshift +from dlt.destinations.impl.redshift.configuration import ( + RedshiftCredentials, + RedshiftClientConfiguration, +) from dlt.destinations.impl.redshift.redshift import RedshiftClient, psycopg2 from tests.common.utils import COMMON_TEST_CASES_PATH @@ -42,6 +47,34 @@ def test_postgres_and_redshift_credentials_defaults() -> None: assert red_cred.port == 5439 +def test_redshift_factory() -> None: + schema = Schema("schema") + dest = redshift() + client = dest.client(schema, RedshiftClientConfiguration()._bind_dataset_name("dataset")) + assert client.config.staging_iam_role is None + assert client.config.has_case_sensitive_identifiers is False + assert client.capabilities.has_case_sensitive_identifiers is False + assert client.capabilities.casefold_identifier is str.lower + + # set args explicitly + dest = redshift(has_case_sensitive_identifiers=True, staging_iam_role="LOADER") + client = dest.client(schema, RedshiftClientConfiguration()._bind_dataset_name("dataset")) + assert client.config.staging_iam_role == "LOADER" + assert client.config.has_case_sensitive_identifiers is True + assert client.capabilities.has_case_sensitive_identifiers is True + assert client.capabilities.casefold_identifier is str + + # set args via config + os.environ["DESTINATION__STAGING_IAM_ROLE"] = "LOADER" + os.environ["DESTINATION__HAS_CASE_SENSITIVE_IDENTIFIERS"] = "True" + dest = redshift() + client = dest.client(schema, RedshiftClientConfiguration()._bind_dataset_name("dataset")) + assert client.config.staging_iam_role == "LOADER" + assert client.config.has_case_sensitive_identifiers is True + assert client.capabilities.has_case_sensitive_identifiers is True + assert client.capabilities.casefold_identifier is str + + @skipifpypy def test_text_too_long(client: RedshiftClient, file_storage: FileStorage) -> None: caps = client.capabilities diff --git a/tests/load/redshift/test_redshift_table_builder.py b/tests/load/redshift/test_redshift_table_builder.py index 2427bc7cfe..de6f450134 100644 --- a/tests/load/redshift/test_redshift_table_builder.py +++ b/tests/load/redshift/test_redshift_table_builder.py @@ -6,6 +6,7 @@ from dlt.common.schema import Schema from dlt.common.configuration import resolve_configuration +from dlt.destinations import redshift from dlt.destinations.impl.redshift.redshift import RedshiftClient from dlt.destinations.impl.redshift.configuration import ( RedshiftClientConfiguration, @@ -21,7 +22,7 @@ @pytest.fixture def client(empty_schema: Schema) -> RedshiftClient: # return client without opening connection - return RedshiftClient( + return redshift().client( empty_schema, RedshiftClientConfiguration(credentials=RedshiftCredentials())._bind_dataset_name( dataset_name="test_" + uniq_id() diff --git 
a/tests/load/snowflake/test_snowflake_configuration.py b/tests/load/snowflake/test_snowflake_configuration.py index 691f0b5a64..10d93d104c 100644 --- a/tests/load/snowflake/test_snowflake_configuration.py +++ b/tests/load/snowflake/test_snowflake_configuration.py @@ -121,10 +121,10 @@ def test_only_authenticator() -> None: } -def test_no_query(environment) -> None: - c = SnowflakeCredentials("snowflake://user1:pass1@host1/db1") - assert str(c.to_url()) == "snowflake://user1:pass1@host1/db1" - print(c.to_url()) +# def test_no_query(environment) -> None: +# c = SnowflakeCredentials("snowflake://user1:pass1@host1/db1") +# assert str(c.to_url()) == "snowflake://user1:pass1@host1/db1" +# print(c.to_url()) def test_query_additional_params() -> None: diff --git a/tests/load/snowflake/test_snowflake_table_builder.py b/tests/load/snowflake/test_snowflake_table_builder.py index bdbe888fb5..4bb69085da 100644 --- a/tests/load/snowflake/test_snowflake_table_builder.py +++ b/tests/load/snowflake/test_snowflake_table_builder.py @@ -5,12 +5,12 @@ from dlt.common.utils import uniq_id from dlt.common.schema import Schema +from dlt.destinations import snowflake from dlt.destinations.impl.snowflake.snowflake import SnowflakeClient from dlt.destinations.impl.snowflake.configuration import ( SnowflakeClientConfiguration, SnowflakeCredentials, ) -from dlt.destinations.exceptions import DestinationSchemaWillNotUpdate from tests.load.utils import TABLE_UPDATE, empty_schema @@ -22,7 +22,7 @@ def snowflake_client(empty_schema: Schema) -> SnowflakeClient: # return client without opening connection creds = SnowflakeCredentials() - return SnowflakeClient( + return snowflake().client( empty_schema, SnowflakeClientConfiguration(credentials=creds)._bind_dataset_name( dataset_name="test_" + uniq_id() diff --git a/tests/load/synapse/test_synapse_configuration.py b/tests/load/synapse/test_synapse_configuration.py index f366d87d09..8aaea03b0f 100644 --- a/tests/load/synapse/test_synapse_configuration.py +++ b/tests/load/synapse/test_synapse_configuration.py @@ -1,8 +1,11 @@ +import os import pytest from dlt.common.configuration import resolve_configuration from dlt.common.exceptions import SystemConfigurationException +from dlt.common.schema import Schema +from dlt.destinations import synapse from dlt.destinations.impl.synapse.configuration import ( SynapseClientConfiguration, SynapseCredentials, @@ -14,7 +17,42 @@ def test_synapse_configuration() -> None: # By default, unique indexes should not be created. 
- assert SynapseClientConfiguration().create_indexes is False + c = SynapseClientConfiguration() + assert c.create_indexes is False + assert c.has_case_sensitive_identifiers is False + assert c.staging_use_msi is False + + +def test_synapse_factory() -> None: + schema = Schema("schema") + dest = synapse() + client = dest.client(schema, SynapseClientConfiguration()._bind_dataset_name("dataset")) + assert client.config.create_indexes is False + assert client.config.staging_use_msi is False + assert client.config.has_case_sensitive_identifiers is False + assert client.capabilities.has_case_sensitive_identifiers is False + assert client.capabilities.casefold_identifier is str + + # set args explicitly + dest = synapse(has_case_sensitive_identifiers=True, create_indexes=True, staging_use_msi=True) + client = dest.client(schema, SynapseClientConfiguration()._bind_dataset_name("dataset")) + assert client.config.create_indexes is True + assert client.config.staging_use_msi is True + assert client.config.has_case_sensitive_identifiers is True + assert client.capabilities.has_case_sensitive_identifiers is True + assert client.capabilities.casefold_identifier is str + + # set args via config + os.environ["DESTINATION__CREATE_INDEXES"] = "True" + os.environ["DESTINATION__STAGING_USE_MSI"] = "True" + os.environ["DESTINATION__HAS_CASE_SENSITIVE_IDENTIFIERS"] = "True" + dest = synapse() + client = dest.client(schema, SynapseClientConfiguration()._bind_dataset_name("dataset")) + assert client.config.create_indexes is True + assert client.config.staging_use_msi is True + assert client.config.has_case_sensitive_identifiers is True + assert client.capabilities.has_case_sensitive_identifiers is True + assert client.capabilities.casefold_identifier is str def test_parse_native_representation() -> None: diff --git a/tests/load/synapse/test_synapse_table_builder.py b/tests/load/synapse/test_synapse_table_builder.py index 9ee2ebe202..1a92a20f1e 100644 --- a/tests/load/synapse/test_synapse_table_builder.py +++ b/tests/load/synapse/test_synapse_table_builder.py @@ -7,17 +7,18 @@ from dlt.common.utils import uniq_id from dlt.common.schema import Schema, TColumnHint -from dlt.destinations.impl.synapse.synapse import SynapseClient +from dlt.destinations import synapse +from dlt.destinations.impl.synapse.synapse import ( + SynapseClient, + HINT_TO_SYNAPSE_ATTR, + TABLE_INDEX_TYPE_TO_SYNAPSE_ATTR, +) from dlt.destinations.impl.synapse.configuration import ( SynapseClientConfiguration, SynapseCredentials, ) from tests.load.utils import TABLE_UPDATE, empty_schema -from dlt.destinations.impl.synapse.synapse import ( - HINT_TO_SYNAPSE_ATTR, - TABLE_INDEX_TYPE_TO_SYNAPSE_ATTR, -) # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -26,7 +27,7 @@ @pytest.fixture def client(empty_schema: Schema) -> SynapseClient: # return client without opening connection - client = SynapseClient( + client = synapse().client( empty_schema, SynapseClientConfiguration(credentials=SynapseCredentials())._bind_dataset_name( dataset_name="test_" + uniq_id() @@ -39,7 +40,7 @@ def client(empty_schema: Schema) -> SynapseClient: @pytest.fixture def client_with_indexes_enabled(empty_schema: Schema) -> SynapseClient: # return client without opening connection - client = SynapseClient( + client = synapse().client( empty_schema, SynapseClientConfiguration( credentials=SynapseCredentials(), create_indexes=True diff --git a/tests/load/synapse/test_synapse_table_indexing.py b/tests/load/synapse/test_synapse_table_indexing.py 
index a9d426ad4a..d877b769cc 100644 --- a/tests/load/synapse/test_synapse_table_indexing.py +++ b/tests/load/synapse/test_synapse_table_indexing.py @@ -1,20 +1,14 @@ import os import pytest from typing import Iterator, List, Any, Union -from textwrap import dedent import dlt from dlt.common.schema import TColumnSchema -from dlt.destinations.sql_client import SqlClientBase - -from dlt.destinations.impl.synapse import synapse_adapter +from dlt.destinations.adapters import synapse_adapter from dlt.destinations.impl.synapse.synapse_adapter import TTableIndexType from tests.load.utils import TABLE_UPDATE, TABLE_ROW_ALL_DATA_TYPES -from tests.load.pipeline.utils import ( - drop_pipeline, -) # this import ensures all test data gets removed from tests.load.synapse.utils import get_storage_table_index_type # mark all tests as essential, do not remove diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index 30de51f069..be917672f1 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -4,11 +4,11 @@ from unittest import mock import pytest from unittest.mock import patch -from typing import List +from typing import List, Tuple from dlt.common.exceptions import TerminalException, TerminalValueError from dlt.common.storages import FileStorage, PackageStorage, ParsedLoadJobFileName -from dlt.common.storages.load_package import LoadJobInfo +from dlt.common.storages.load_package import LoadJobInfo, TJobState from dlt.common.storages.load_storage import JobFileFormatUnsupported from dlt.common.destination.reference import LoadJob, TDestination from dlt.common.schema.utils import ( @@ -31,7 +31,6 @@ clean_test_storage, init_test_logging, TEST_DICT_CONFIG_PROVIDER, - preserve_environ, ) from tests.load.utils import prepare_load_package from tests.utils import skip_if_not_active, TEST_STORAGE_ROOT @@ -97,15 +96,11 @@ def test_unsupported_write_disposition() -> None: with ThreadPoolExecutor() as pool: load.run(pool) # job with unsupported write disp. 
is failed - exception_file = [ - f - for f in load.load_storage.normalized_packages.list_failed_jobs(load_id) - if f.endswith(".exception") - ][0] - assert ( - "LoadClientUnsupportedWriteDisposition" - in load.load_storage.normalized_packages.storage.load(exception_file) + failed_job = load.load_storage.normalized_packages.list_failed_jobs(load_id)[0] + failed_message = load.load_storage.normalized_packages.get_job_failed_message( + load_id, ParsedLoadJobFileName.parse(failed_job) ) + assert "LoadClientUnsupportedWriteDisposition" in failed_message def test_get_new_jobs_info() -> None: @@ -125,7 +120,7 @@ def test_get_completed_table_chain_single_job_per_table() -> None: schema.tables[table_name] = fill_hints_from_parent_and_clone_table(schema.tables, table) top_job_table = get_top_level_table(schema.tables, "event_user") - all_jobs = load.load_storage.normalized_packages.list_all_jobs(load_id) + all_jobs = load.load_storage.normalized_packages.list_all_jobs_with_states(load_id) assert get_completed_table_chain(schema, all_jobs, top_job_table) is None # fake being completed assert ( @@ -144,12 +139,12 @@ def test_get_completed_table_chain_single_job_per_table() -> None: load.load_storage.normalized_packages.start_job( load_id, "event_loop_interrupted.839c6e6b514e427687586ccc65bf133f.0.jsonl" ) - all_jobs = load.load_storage.normalized_packages.list_all_jobs(load_id) + all_jobs = load.load_storage.normalized_packages.list_all_jobs_with_states(load_id) assert get_completed_table_chain(schema, all_jobs, loop_top_job_table) is None load.load_storage.normalized_packages.complete_job( load_id, "event_loop_interrupted.839c6e6b514e427687586ccc65bf133f.0.jsonl" ) - all_jobs = load.load_storage.normalized_packages.list_all_jobs(load_id) + all_jobs = load.load_storage.normalized_packages.list_all_jobs_with_states(load_id) assert get_completed_table_chain(schema, all_jobs, loop_top_job_table) == [ schema.get_table("event_loop_interrupted") ] @@ -485,9 +480,7 @@ def test_extend_table_chain() -> None: # no jobs for bot assert _extend_tables_with_table_chain(schema, ["event_bot"], ["event_user"]) == set() # skip unseen tables - del schema.tables["event_user__parse_data__entities"][ # type:ignore[typeddict-item] - "x-normalizer" - ] + del schema.tables["event_user__parse_data__entities"]["x-normalizer"] entities_chain = { name for name in schema.data_table_names() @@ -533,25 +526,15 @@ def test_get_completed_table_chain_cases() -> None: # child completed, parent not event_user = schema.get_table("event_user") event_user_entities = schema.get_table("event_user__parse_data__entities") - event_user_job = LoadJobInfo( + event_user_job: Tuple[TJobState, ParsedLoadJobFileName] = ( "started_jobs", - "path", - 0, - None, - 0, ParsedLoadJobFileName("event_user", "event_user_id", 0, "jsonl"), - None, ) - event_user_entities_job = LoadJobInfo( + event_user_entities_job: Tuple[TJobState, ParsedLoadJobFileName] = ( "completed_jobs", - "path", - 0, - None, - 0, ParsedLoadJobFileName( "event_user__parse_data__entities", "event_user__parse_data__entities_id", 0, "jsonl" ), - None, ) chain = get_completed_table_chain(schema, [event_user_job, event_user_entities_job], event_user) assert chain is None @@ -561,24 +544,21 @@ def test_get_completed_table_chain_cases() -> None: schema, [event_user_job, event_user_entities_job], event_user, - event_user_job.job_file_info.job_id(), + event_user_job[1].job_id(), ) # full chain assert chain == [event_user, event_user_entities] # parent failed, child completed chain = 
get_completed_table_chain( - schema, [event_user_job._replace(state="failed_jobs"), event_user_entities_job], event_user + schema, [("failed_jobs", event_user_job[1]), event_user_entities_job], event_user ) assert chain == [event_user, event_user_entities] # both failed chain = get_completed_table_chain( schema, - [ - event_user_job._replace(state="failed_jobs"), - event_user_entities_job._replace(state="failed_jobs"), - ], + [("failed_jobs", event_user_job[1]), ("failed_jobs", event_user_entities_job[1])], event_user, ) assert chain == [event_user, event_user_entities] @@ -589,16 +569,16 @@ def test_get_completed_table_chain_cases() -> None: event_user["write_disposition"] = w_d # type:ignore[typeddict-item] chain = get_completed_table_chain( - schema, [event_user_job], event_user, event_user_job.job_file_info.job_id() + schema, [event_user_job], event_user, event_user_job[1].job_id() ) assert chain == user_chain # but if child is present and incomplete... chain = get_completed_table_chain( schema, - [event_user_job, event_user_entities_job._replace(state="new_jobs")], + [event_user_job, ("new_jobs", event_user_entities_job[1])], event_user, - event_user_job.job_file_info.job_id(), + event_user_job[1].job_id(), ) # noting is returned assert chain is None @@ -607,9 +587,9 @@ def test_get_completed_table_chain_cases() -> None: deep_child = schema.tables[ "event_user__parse_data__response_selector__default__response__response_templates" ] - del deep_child["x-normalizer"] # type:ignore[typeddict-item] + del deep_child["x-normalizer"] chain = get_completed_table_chain( - schema, [event_user_job], event_user, event_user_job.job_file_info.job_id() + schema, [event_user_job], event_user, event_user_job[1].job_id() ) user_chain.remove(deep_child) assert chain == user_chain @@ -784,7 +764,7 @@ def assert_complete_job(load: Load, should_delete_completed: bool = False) -> No assert not load.load_storage.storage.has_folder( load.load_storage.get_normalized_package_path(load_id) ) - completed_path = load.load_storage.loaded_packages.get_job_folder_path( + completed_path = load.load_storage.loaded_packages.get_job_state_folder_path( load_id, "completed_jobs" ) if should_delete_completed: diff --git a/tests/load/test_insert_job_client.py b/tests/load/test_insert_job_client.py index 1c035f7f68..38155a8b09 100644 --- a/tests/load/test_insert_job_client.py +++ b/tests/load/test_insert_job_client.py @@ -11,10 +11,14 @@ from dlt.destinations.insert_job_client import InsertValuesJobClient from tests.utils import TEST_STORAGE_ROOT, skipifpypy -from tests.load.utils import expect_load_file, prepare_table, yield_client_with_storage -from tests.load.pipeline.utils import destinations_configs +from tests.load.utils import ( + expect_load_file, + prepare_table, + yield_client_with_storage, + destinations_configs, +) -DEFAULT_SUBSET = ["duckdb", "redshift", "postgres", "mssql", "synapse"] +DEFAULT_SUBSET = ["duckdb", "redshift", "postgres", "mssql", "synapse", "motherduck"] @pytest.fixture @@ -176,7 +180,6 @@ def test_loading_errors(client: InsertValuesJobClient, file_storage: FileStorage ids=lambda x: x.name, ) def test_query_split(client: InsertValuesJobClient, file_storage: FileStorage) -> None: - mocked_caps = client.sql_client.__class__.capabilities writer_type = client.capabilities.insert_values_writer_type insert_sql = prepare_insert_statement(10, writer_type) @@ -185,10 +188,10 @@ def test_query_split(client: InsertValuesJobClient, file_storage: FileStorage) - elif writer_type == "select_union": pre, post, 
sep = ("SELECT ", "", " UNION ALL\n") + # caps are instance and are attr of sql client instance so it is safe to mock them + client.sql_client.capabilities.max_query_length = 2 # this guarantees that we execute inserts line by line - with patch.object(mocked_caps, "max_query_length", 2), patch.object( - client.sql_client, "execute_fragments" - ) as mocked_fragments: + with patch.object(client.sql_client, "execute_fragments") as mocked_fragments: user_table_name = prepare_table(client) expect_load_file(client, file_storage, insert_sql, user_table_name) # print(mocked_fragments.mock_calls) @@ -211,9 +214,8 @@ def test_query_split(client: InsertValuesJobClient, file_storage: FileStorage) - # set query length so it reads data until separator ("," or " UNION ALL") (followed by \n) query_length = (idx - start_idx - 1) * 2 - with patch.object(mocked_caps, "max_query_length", query_length), patch.object( - client.sql_client, "execute_fragments" - ) as mocked_fragments: + client.sql_client.capabilities.max_query_length = query_length + with patch.object(client.sql_client, "execute_fragments") as mocked_fragments: user_table_name = prepare_table(client) expect_load_file(client, file_storage, insert_sql, user_table_name) # split in 2 on ',' @@ -221,9 +223,8 @@ def test_query_split(client: InsertValuesJobClient, file_storage: FileStorage) - # so it reads until "\n" query_length = (idx - start_idx) * 2 - with patch.object(mocked_caps, "max_query_length", query_length), patch.object( - client.sql_client, "execute_fragments" - ) as mocked_fragments: + client.sql_client.capabilities.max_query_length = query_length + with patch.object(client.sql_client, "execute_fragments") as mocked_fragments: user_table_name = prepare_table(client) expect_load_file(client, file_storage, insert_sql, user_table_name) # split in 2 on separator ("," or " UNION ALL") @@ -235,9 +236,8 @@ def test_query_split(client: InsertValuesJobClient, file_storage: FileStorage) - elif writer_type == "select_union": offset = 1 query_length = (len(insert_sql) - start_idx - offset) * 2 - with patch.object(mocked_caps, "max_query_length", query_length), patch.object( - client.sql_client, "execute_fragments" - ) as mocked_fragments: + client.sql_client.capabilities.max_query_length = query_length + with patch.object(client.sql_client, "execute_fragments") as mocked_fragments: user_table_name = prepare_table(client) expect_load_file(client, file_storage, insert_sql, user_table_name) # split in 2 on ',' @@ -251,22 +251,21 @@ def assert_load_with_max_query( max_query_length: int, ) -> None: # load and check for real - mocked_caps = client.sql_client.__class__.capabilities - with patch.object(mocked_caps, "max_query_length", max_query_length): - user_table_name = prepare_table(client) - insert_sql = prepare_insert_statement( - insert_lines, client.capabilities.insert_values_writer_type - ) - expect_load_file(client, file_storage, insert_sql, user_table_name) - canonical_name = client.sql_client.make_qualified_table_name(user_table_name) - rows_count = client.sql_client.execute_sql(f"SELECT COUNT(1) FROM {canonical_name}")[0][0] - assert rows_count == insert_lines - # get all uniq ids in order - rows = client.sql_client.execute_sql( - f"SELECT _dlt_id FROM {canonical_name} ORDER BY timestamp ASC;" - ) - v_ids = list(map(lambda i: i[0], rows)) - assert list(map(str, range(0, insert_lines))) == v_ids + client.sql_client.capabilities.max_query_length = max_query_length + user_table_name = prepare_table(client) + insert_sql = prepare_insert_statement( + 
insert_lines, client.capabilities.insert_values_writer_type + ) + expect_load_file(client, file_storage, insert_sql, user_table_name) + canonical_name = client.sql_client.make_qualified_table_name(user_table_name) + rows_count = client.sql_client.execute_sql(f"SELECT COUNT(1) FROM {canonical_name}")[0][0] + assert rows_count == insert_lines + # get all uniq ids in order + rows = client.sql_client.execute_sql( + f"SELECT _dlt_id FROM {canonical_name} ORDER BY timestamp ASC;" + ) + v_ids = list(map(lambda i: i[0], rows)) + assert list(map(str, range(0, insert_lines))) == v_ids client.sql_client.execute_sql(f"DELETE FROM {canonical_name}") diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index 7e360a6664..35b988d46e 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -5,7 +5,7 @@ from unittest.mock import patch import pytest import datetime # noqa: I251 -from typing import Iterator, Tuple, List, Dict, Any, Mapping, MutableMapping +from typing import Iterator, Tuple, List, Dict, Any from dlt.common import json, pendulum from dlt.common.schema import Schema @@ -15,7 +15,7 @@ TWriteDisposition, TTableSchema, ) -from dlt.common.schema.utils import new_table, new_column +from dlt.common.schema.utils import new_table, new_column, pipeline_state_table from dlt.common.storages import FileStorage from dlt.common.schema import TTableSchemaColumns from dlt.common.utils import uniq_id @@ -26,7 +26,7 @@ ) from dlt.destinations.job_client_impl import SqlJobClientBase -from dlt.common.destination.reference import WithStagingDataset +from dlt.common.destination.reference import StateInfo, WithStagingDataset from tests.cases import table_update_and_row, assert_all_data_types_row from tests.utils import TEST_STORAGE_ROOT, autouse_test_storage @@ -41,8 +41,13 @@ cm_yield_client_with_storage, write_dataset, prepare_table, + normalize_storage_table_cols, + destinations_configs, + DestinationTestConfiguration, ) -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential @pytest.fixture @@ -69,13 +74,18 @@ def test_initialize_storage(client: SqlJobClientBase) -> None: ) def test_get_schema_on_empty_storage(client: SqlJobClientBase) -> None: # test getting schema on empty dataset without any tables - exists, _ = client.get_storage_table(VERSION_TABLE_NAME) - assert exists is False + table_name, table_columns = list(client.get_storage_tables([VERSION_TABLE_NAME]))[0] + assert table_name == VERSION_TABLE_NAME + assert len(table_columns) == 0 schema_info = client.get_stored_schema() assert schema_info is None schema_info = client.get_stored_schema_by_hash("8a0298298823928939") assert schema_info is None + # now try to get several non existing tables + storage_tables = list(client.get_storage_tables(["no_table_1", "no_table_2"])) + assert [("no_table_1", {}), ("no_table_2", {})] == storage_tables + @pytest.mark.order(3) @pytest.mark.parametrize( @@ -90,17 +100,17 @@ def test_get_update_basic_schema(client: SqlJobClientBase) -> None: # check is event slot has variant assert schema_update["event_slot"]["columns"]["value"]["variant"] is True # now we have dlt tables - exists, _ = client.get_storage_table(VERSION_TABLE_NAME) - assert exists is True - exists, _ = client.get_storage_table(LOADS_TABLE_NAME) - assert exists is True + storage_tables = list(client.get_storage_tables([VERSION_TABLE_NAME, LOADS_TABLE_NAME])) + assert set([table[0] for table in 
storage_tables]) == {VERSION_TABLE_NAME, LOADS_TABLE_NAME} + assert [len(table[1]) > 0 for table in storage_tables] == [True, True] # verify if schemas stored this_schema = client.get_stored_schema_by_hash(schema.version_hash) newest_schema = client.get_stored_schema() # should point to the same schema assert this_schema == newest_schema # check fields - assert this_schema.version == 1 == schema.version + # NOTE: schema version == 2 because we updated default hints after loading the schema + assert this_schema.version == 2 == schema.version assert this_schema.version_hash == schema.stored_version_hash assert this_schema.engine_version == schema.ENGINE_VERSION assert this_schema.schema_name == schema.name @@ -120,7 +130,7 @@ def test_get_update_basic_schema(client: SqlJobClientBase) -> None: this_schema = client.get_stored_schema_by_hash(schema.version_hash) newest_schema = client.get_stored_schema() assert this_schema == newest_schema - assert this_schema.version == schema.version == 2 + assert this_schema.version == schema.version == 3 assert this_schema.version_hash == schema.stored_version_hash # simulate parallel write: initial schema is modified differently and written alongside the first one @@ -128,14 +138,14 @@ def test_get_update_basic_schema(client: SqlJobClientBase) -> None: first_schema = Schema.from_dict(json.loads(first_version_schema)) first_schema.tables["event_bot"]["write_disposition"] = "replace" first_schema._bump_version() - assert first_schema.version == this_schema.version == 2 + assert first_schema.version == this_schema.version == 3 # wait to make load_newest_schema deterministic sleep(1) client._update_schema_in_storage(first_schema) this_schema = client.get_stored_schema_by_hash(first_schema.version_hash) newest_schema = client.get_stored_schema() assert this_schema == newest_schema # error - assert this_schema.version == first_schema.version == 2 + assert this_schema.version == first_schema.version == 3 assert this_schema.version_hash == first_schema.stored_version_hash # get schema with non existing hash @@ -157,7 +167,6 @@ def test_get_update_basic_schema(client: SqlJobClientBase) -> None: assert this_schema == newest_schema -@pytest.mark.essential @pytest.mark.parametrize( "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name ) @@ -190,11 +199,11 @@ def test_complete_load(client: SqlJobClientBase) -> None: @pytest.mark.parametrize( "client", - destinations_configs(default_sql_configs=True, subset=["redshift", "postgres", "duckdb"]), + destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name, ) -def test_schema_update_create_table_redshift(client: SqlJobClientBase) -> None: +def test_schema_update_create_table(client: SqlJobClientBase) -> None: # infer typical rasa event schema schema = client.schema table_name = "event_test_table" + uniq_id() @@ -215,8 +224,8 @@ def test_schema_update_create_table_redshift(client: SqlJobClientBase) -> None: assert table_update["timestamp"]["sort"] is True assert table_update["sender_id"]["cluster"] is True assert table_update["_dlt_id"]["unique"] is True - exists, _ = client.get_storage_table(table_name) - assert exists is True + _, storage_columns = list(client.get_storage_tables([table_name]))[0] + assert len(storage_columns) > 0 @pytest.mark.parametrize( @@ -225,7 +234,15 @@ def test_schema_update_create_table_redshift(client: SqlJobClientBase) -> None: indirect=True, ids=lambda x: x.name, ) -def test_schema_update_create_table_bigquery(client: 
SqlJobClientBase) -> None: +@pytest.mark.parametrize("dataset_name", (None, "_hidden_ds")) +def test_schema_update_create_table_bigquery(client: SqlJobClientBase, dataset_name: str) -> None: + # patch dataset name + if dataset_name: + # drop existing dataset + client.drop_storage() + client.sql_client.dataset_name = dataset_name + "_" + uniq_id() + client.initialize_storage() + # infer typical rasa event schema schema = client.schema # this will be partition @@ -241,14 +258,11 @@ def test_schema_update_create_table_bigquery(client: SqlJobClientBase) -> None: table_update = schema_update["event_test_table"]["columns"] assert table_update["timestamp"]["partition"] is True assert table_update["_dlt_id"]["nullable"] is False - exists, storage_table = client.get_storage_table("event_test_table") - assert exists is True - assert storage_table["timestamp"]["partition"] is True - assert storage_table["sender_id"]["cluster"] is True - exists, storage_table = client.get_storage_table("_dlt_version") - assert exists is True - assert storage_table["version"]["partition"] is False - assert storage_table["version"]["cluster"] is False + _, storage_columns = client.get_storage_table("event_test_table") + # check if all columns present + assert storage_columns.keys() == client.schema.tables["event_test_table"]["columns"].keys() + _, storage_columns = client.get_storage_table("_dlt_version") + assert storage_columns.keys() == client.schema.tables["_dlt_version"]["columns"].keys() @pytest.mark.parametrize( @@ -285,10 +299,11 @@ def test_schema_update_alter_table(client: SqlJobClientBase) -> None: assert len(schema_update[table_name]["columns"]) == 2 assert schema_update[table_name]["columns"]["col3"]["data_type"] == "double" assert schema_update[table_name]["columns"]["col4"]["data_type"] == "timestamp" - _, storage_table = client.get_storage_table(table_name) + _, storage_table_cols = client.get_storage_table(table_name) # 4 columns - assert len(storage_table) == 4 - assert storage_table["col4"]["data_type"] == "timestamp" + assert len(storage_table_cols) == 4 + storage_table_cols = normalize_storage_table_cols(table_name, storage_table_cols, schema) + assert storage_table_cols["col4"]["data_type"] == "timestamp" @pytest.mark.parametrize( @@ -341,9 +356,7 @@ def test_drop_tables(client: SqlJobClientBase) -> None: client.drop_tables(*tables_to_drop, delete_schema=False) # Verify requested tables are dropped - for tbl in tables_to_drop: - exists, _ = client.get_storage_table(tbl) - assert not exists + assert all(len(table[1]) == 0 for table in client.get_storage_tables(tables_to_drop)) # Verify _dlt_version schema is updated and old versions deleted table_name = client.sql_client.make_qualified_table_name(VERSION_TABLE_NAME) @@ -376,14 +389,13 @@ def test_get_storage_table_with_all_types(client: SqlJobClientBase) -> None: for name, column in table_update.items(): assert column.items() >= TABLE_UPDATE_COLUMNS_SCHEMA[name].items() # now get the actual schema from the db - exists, storage_table = client.get_storage_table(table_name) - assert exists is True + _, storage_table = list(client.get_storage_tables([table_name]))[0] + assert len(storage_table) > 0 # column order must match TABLE_UPDATE storage_columns = list(storage_table.values()) for c, expected_c in zip(TABLE_UPDATE, storage_columns): - # print(c["name"]) - # print(c["data_type"]) - assert c["name"] == expected_c["name"] + # storage columns are returned with column names as in information schema + assert 
client.capabilities.casefold_identifier(c["name"]) == expected_c["name"] # athena does not know wei data type and has no JSON type, time is not supported with parquet tables if client.config.destination_type == "athena" and c["data_type"] in ( "wei", @@ -429,8 +441,7 @@ def _assert_columns_order(sql_: str) -> None: if hasattr(client.sql_client, "escape_ddl_identifier"): col_name = client.sql_client.escape_ddl_identifier(c["name"]) else: - col_name = client.capabilities.escape_identifier(c["name"]) - print(col_name) + col_name = client.sql_client.escape_column_name(c["name"]) # find column names idx = sql_.find(col_name, idx) assert idx > 0, f"column {col_name} not found in script" @@ -716,6 +727,53 @@ def test_default_schema_name_init_storage(destination_config: DestinationTestCon assert client.sql_client.has_dataset() +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) +@pytest.mark.parametrize( + "naming_convention", + [ + "tests.common.cases.normalizers.title_case", + "snake_case", + ], +) +def test_get_stored_state( + destination_config: DestinationTestConfiguration, + naming_convention: str, + file_storage: FileStorage, +) -> None: + os.environ["SCHEMA__NAMING"] = naming_convention + + with cm_yield_client_with_storage( + destination_config.destination, default_config_values={"default_schema_name": None} + ) as client: + # event schema with event table + if not client.capabilities.preferred_loader_file_format: + pytest.skip( + "preferred loader file format not set, destination will only work with staging" + ) + # load pipeline state + state_table = pipeline_state_table() + partial = client.schema.update_table(state_table) + print(partial) + client.schema._bump_version() + client.update_stored_schema() + + state_info = StateInfo(1, 4, "pipeline", "compressed", pendulum.now(), None, "_load_id") + doc = state_info.as_doc() + norm_doc = {client.schema.naming.normalize_identifier(k): v for k, v in doc.items()} + with io.BytesIO() as f: + # use normalized columns + write_dataset(client, f, [norm_doc], partial["columns"]) + query = f.getvalue().decode() + expect_load_file(client, file_storage, query, partial["name"]) + client.complete_load("_load_id") + + # get state + stored_state = client.get_stored_state("pipeline") + assert doc == stored_state.as_doc() + + @pytest.mark.parametrize( "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name ) diff --git a/tests/load/test_sql_client.py b/tests/load/test_sql_client.py index 26d7884179..fa31f1db65 100644 --- a/tests/load/test_sql_client.py +++ b/tests/load/test_sql_client.py @@ -22,8 +22,15 @@ from dlt.common.time import ensure_pendulum_datetime from tests.utils import TEST_STORAGE_ROOT, autouse_test_storage -from tests.load.utils import yield_client_with_storage, prepare_table, AWS_BUCKET -from tests.load.pipeline.utils import destinations_configs +from tests.load.utils import ( + yield_client_with_storage, + prepare_table, + AWS_BUCKET, + destinations_configs, +) + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential @pytest.fixture @@ -141,7 +148,6 @@ def test_malformed_execute_parameters(client: SqlJobClientBase) -> None: assert client.sql_client.is_dbapi_exception(term_ex.value.dbapi_exception) -@pytest.mark.essential @pytest.mark.parametrize( "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name ) @@ -189,7 +195,6 @@ def test_execute_sql(client: SqlJobClientBase) -> 
None: assert len(rows) == 0 -@pytest.mark.essential @pytest.mark.parametrize( "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name ) @@ -212,7 +217,6 @@ def test_execute_ddl(client: SqlJobClientBase) -> None: assert rows[0][0] == Decimal("1.0") -@pytest.mark.essential @pytest.mark.parametrize( "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name ) @@ -255,7 +259,6 @@ def test_execute_query(client: SqlJobClientBase) -> None: assert len(rows) == 0 -@pytest.mark.essential @pytest.mark.parametrize( "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name ) @@ -307,7 +310,6 @@ def test_execute_df(client: SqlJobClientBase) -> None: assert df_3 is None -@pytest.mark.essential @pytest.mark.parametrize( "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name ) @@ -490,7 +492,7 @@ def test_transaction_isolation(client: SqlJobClientBase) -> None: def test_thread(thread_id: Decimal) -> None: # make a copy of the sql_client thread_client = client.sql_client.__class__( - client.sql_client.dataset_name, client.sql_client.credentials + client.sql_client.dataset_name, client.sql_client.credentials, client.capabilities ) with thread_client: with thread_client.begin_transaction(): diff --git a/tests/load/utils.py b/tests/load/utils.py index 8048d9fe51..8c6446b921 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -24,13 +24,15 @@ from dlt.common.destination import TLoaderFileFormat, Destination from dlt.common.destination.reference import DEFAULT_FILE_LAYOUT from dlt.common.data_writers import DataWriter +from dlt.common.pipeline import PipelineContext from dlt.common.schema import TTableSchemaColumns, Schema from dlt.common.storages import SchemaStorage, FileStorage, SchemaStorageConfiguration -from dlt.common.schema.utils import new_table +from dlt.common.schema.utils import new_table, normalize_table_identifiers from dlt.common.storages import ParsedLoadJobFileName, LoadStorage, PackageStorage from dlt.common.typing import StrAny from dlt.common.utils import uniq_id +from dlt.destinations.exceptions import CantExtractTablePrefix from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.job_client_impl import SqlJobClientBase @@ -126,6 +128,7 @@ class DestinationTestConfiguration: force_iceberg: bool = False supports_dbt: bool = True disable_compression: bool = False + dev_mode: bool = False @property def name(self) -> str: @@ -140,15 +143,26 @@ def name(self) -> str: name += f"-{self.extra_info}" return name + @property + def factory_kwargs(self) -> Dict[str, Any]: + return { + k: getattr(self, k) + for k in [ + "bucket_url", + "stage_name", + "staging_iam_role", + "staging_use_msi", + "force_iceberg", + ] + if getattr(self, k, None) is not None + } + def setup(self) -> None: """Sets up environment variables for this destination configuration""" - os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = self.bucket_url or "" - os.environ["DESTINATION__STAGE_NAME"] = self.stage_name or "" - os.environ["DESTINATION__STAGING_IAM_ROLE"] = self.staging_iam_role or "" - os.environ["DESTINATION__STAGING_USE_MSI"] = str(self.staging_use_msi) or "" - os.environ["DESTINATION__FORCE_ICEBERG"] = str(self.force_iceberg) or "" + for k, v in self.factory_kwargs.items(): + os.environ[f"DESTINATION__{k.upper()}"] = str(v) - """For the filesystem destinations we disable compression to make analyzing the result easier""" + # For the filesystem 
destinations we disable compression to make analyzing the result easier if self.destination == "filesystem" or self.disable_compression: os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "True" @@ -156,17 +170,24 @@ def setup_pipeline( self, pipeline_name: str, dataset_name: str = None, dev_mode: bool = False, **kwargs ) -> dlt.Pipeline: """Convenience method to setup pipeline with this configuration""" + self.dev_mode = dev_mode self.setup() pipeline = dlt.pipeline( pipeline_name=pipeline_name, - destination=self.destination, - staging=self.staging, + destination=kwargs.pop("destination", self.destination), + staging=kwargs.pop("staging", self.staging), dataset_name=dataset_name or pipeline_name, dev_mode=dev_mode, **kwargs, ) return pipeline + def attach_pipeline(self, pipeline_name: str, **kwargs) -> dlt.Pipeline: + """Attach to existing pipeline keeping the dev_mode""" + # remember dev_mode from setup_pipeline + pipeline = dlt.attach(pipeline_name, **kwargs) + return pipeline + def destinations_configs( default_sql_configs: bool = False, @@ -489,6 +510,60 @@ def destinations_configs( return destination_configs +@pytest.fixture(autouse=True) +def drop_pipeline(request, preserve_environ) -> Iterator[None]: + # NOTE: keep `preserve_environ` to make sure fixtures are executed in order`` + yield + if "no_load" in request.keywords: + return + try: + drop_active_pipeline_data() + except CantExtractTablePrefix: + # for some tests we test that this exception is raised, + # so we suppress it here + pass + + +def drop_active_pipeline_data() -> None: + """Drops all the datasets for currently active pipeline, wipes the working folder and then deactivated it.""" + if Container()[PipelineContext].is_active(): + try: + # take existing pipeline + p = dlt.pipeline() + + def _drop_dataset(schema_name: str) -> None: + with p.destination_client(schema_name) as client: + try: + client.drop_storage() + print("dropped") + except Exception as exc: + print(exc) + if isinstance(client, WithStagingDataset): + with client.with_staging_dataset(): + try: + client.drop_storage() + print("staging dropped") + except Exception as exc: + print(exc) + + # drop_func = _drop_dataset_fs if _is_filesystem(p) else _drop_dataset_sql + # take all schemas and if destination was set + if p.destination: + if p.config.use_single_dataset: + # drop just the dataset for default schema + if p.default_schema_name: + _drop_dataset(p.default_schema_name) + else: + # for each schema, drop the dataset + for schema_name in p.schema_names: + _drop_dataset(schema_name) + + # p._wipe_working_folder() + finally: + # always deactivate context, working directory will be wiped when the next test starts + Container()[PipelineContext].deactivate() + + @pytest.fixture def empty_schema() -> Schema: schema = Schema("event") @@ -580,6 +655,9 @@ def yield_client( ) schema_storage = SchemaStorage(storage_config) schema = schema_storage.load_schema(schema_name) + schema.update_normalizers() + # NOTE: schema version is bumped because new default hints are added + schema._bump_version() # create client and dataset client: SqlJobClientBase = None @@ -680,7 +758,7 @@ def prepare_load_package( shutil.copy( path, load_storage.new_packages.storage.make_full_path( - load_storage.new_packages.get_job_folder_path(load_id, "new_jobs") + load_storage.new_packages.get_job_state_folder_path(load_id, "new_jobs") ), ) schema_path = Path("./tests/load/cases/loading/schema.json") @@ -708,3 +786,15 @@ def sequence_generator() -> Generator[List[Dict[str, str]], None, None]: 
while True: yield [{"content": str(count + i)} for i in range(3)] count += 3 + + +def normalize_storage_table_cols( + table_name: str, cols: TTableSchemaColumns, schema: Schema +) -> TTableSchemaColumns: + """Normalize storage table columns back into schema naming""" + # go back to schema naming convention. this is a hack - will work here to + # reverse snowflake UPPER case folding + storage_table = normalize_table_identifiers( + new_table(table_name, columns=cols.values()), schema.naming # type: ignore[arg-type] + ) + return storage_table["columns"] diff --git a/tests/load/weaviate/test_pipeline.py b/tests/load/weaviate/test_pipeline.py index ee42ab59d8..fc46d00d05 100644 --- a/tests/load/weaviate/test_pipeline.py +++ b/tests/load/weaviate/test_pipeline.py @@ -4,9 +4,13 @@ import dlt from dlt.common import json +from dlt.common.schema.exceptions import ( + SchemaCorruptedException, + SchemaIdentifierNormalizationCollision, +) from dlt.common.utils import uniq_id -from dlt.destinations.impl.weaviate import weaviate_adapter +from dlt.destinations.adapters import weaviate_adapter from dlt.destinations.impl.weaviate.exceptions import PropertyNameConflict from dlt.destinations.impl.weaviate.weaviate_adapter import VECTORIZE_HINT, TOKENIZATION_HINT from dlt.destinations.impl.weaviate.weaviate_client import WeaviateClient @@ -244,7 +248,8 @@ def movies_data(): assert_class(pipeline, "MoviesData", items=data) -def test_pipeline_with_schema_evolution(): +@pytest.mark.parametrize("vectorized", (True, False), ids=("vectorized", "not-vectorized")) +def test_pipeline_with_schema_evolution(vectorized: bool): data = [ { "doc_id": 1, @@ -260,7 +265,8 @@ def test_pipeline_with_schema_evolution(): def some_data(): yield data - weaviate_adapter(some_data, vectorize=["content"]) + if vectorized: + weaviate_adapter(some_data, vectorize=["content"]) pipeline = dlt.pipeline( pipeline_name="test_pipeline_append", @@ -280,17 +286,22 @@ def some_data(): "doc_id": 3, "content": "3", "new_column": "new", + "new_vec_column": "lorem lorem", }, { "doc_id": 4, "content": "4", "new_column": "new", + "new_vec_column": "lorem lorem", }, ] - pipeline.run( - some_data(), - ) + some_data_2 = some_data() + + if vectorized: + weaviate_adapter(some_data_2, vectorize=["new_vec_column"]) + + pipeline.run(some_data_2) table_schema = pipeline.default_schema.tables["SomeData"] assert "new_column" in table_schema["columns"] @@ -298,6 +309,8 @@ def some_data(): aggregated_data.extend(data) aggregated_data[0]["new_column"] = None aggregated_data[1]["new_column"] = None + aggregated_data[0]["new_vec_column"] = None + aggregated_data[1]["new_vec_column"] = None assert_class(pipeline, "SomeData", items=aggregated_data) @@ -391,7 +404,7 @@ def test_vectorize_property_without_data() -> None: primary_key="vAlue", columns={"vAlue": {"data_type": "text"}}, ) - assert isinstance(pipe_ex.value.__context__, PropertyNameConflict) + assert isinstance(pipe_ex.value.__context__, SchemaIdentifierNormalizationCollision) # set the naming convention to case insensitive os.environ["SCHEMA__NAMING"] = "dlt.destinations.impl.weaviate.ci_naming" diff --git a/tests/load/weaviate/test_weaviate_client.py b/tests/load/weaviate/test_weaviate_client.py index 8c3344f152..dc2110d2f6 100644 --- a/tests/load/weaviate/test_weaviate_client.py +++ b/tests/load/weaviate/test_weaviate_client.py @@ -5,6 +5,7 @@ from dlt.common.schema import Schema from dlt.common.configuration.container import Container from dlt.common.configuration.specs.config_section_context import 
ConfigSectionContext +from dlt.common.schema.exceptions import SchemaIdentifierNormalizationCollision from dlt.common.utils import uniq_id from dlt.common.schema.typing import TWriteDisposition, TColumnSchema, TTableSchemaColumns @@ -13,7 +14,7 @@ from dlt.destinations.impl.weaviate.weaviate_client import WeaviateClient from dlt.common.storages.file_storage import FileStorage -from dlt.common.schema.utils import new_table +from dlt.common.schema.utils import new_table, normalize_table_identifiers from tests.load.utils import ( TABLE_ROW_ALL_DATA_TYPES, TABLE_UPDATE, @@ -58,11 +59,11 @@ def make_client(naming_convention: str) -> Iterator[WeaviateClient]: "test_schema", {"names": f"dlt.destinations.impl.weaviate.{naming_convention}", "json": None}, ) - _client = get_client_instance(schema) - try: - yield _client - finally: - _client.drop_storage() + with get_client_instance(schema) as _client: + try: + yield _client + finally: + _client.drop_storage() @pytest.fixture @@ -114,11 +115,18 @@ def test_case_sensitive_properties_create(client: WeaviateClient) -> None: {"name": "coL1", "data_type": "double", "nullable": False}, ] client.schema.update_table( - client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create)) + normalize_table_identifiers( + new_table(class_name, columns=table_create), client.schema.naming + ) ) client.schema._bump_version() - with pytest.raises(PropertyNameConflict): + with pytest.raises(SchemaIdentifierNormalizationCollision) as clash_ex: client.update_stored_schema() + assert clash_ex.value.identifier_type == "column" + assert clash_ex.value.identifier_name == "coL1" + assert clash_ex.value.conflict_identifier_name == "col1" + assert clash_ex.value.table_name == "ColClass" + assert clash_ex.value.naming_name == "dlt.destinations.impl.weaviate.naming" def test_case_insensitive_properties_create(ci_client: WeaviateClient) -> None: @@ -129,7 +137,9 @@ def test_case_insensitive_properties_create(ci_client: WeaviateClient) -> None: {"name": "coL1", "data_type": "double", "nullable": False}, ] ci_client.schema.update_table( - ci_client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create)) + normalize_table_identifiers( + new_table(class_name, columns=table_create), ci_client.schema.naming + ) ) ci_client.schema._bump_version() ci_client.update_stored_schema() @@ -146,16 +156,20 @@ def test_case_sensitive_properties_add(client: WeaviateClient) -> None: {"name": "coL1", "data_type": "double", "nullable": False}, ] client.schema.update_table( - client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create)) + normalize_table_identifiers( + new_table(class_name, columns=table_create), client.schema.naming + ) ) client.schema._bump_version() client.update_stored_schema() client.schema.update_table( - client.schema.normalize_table_identifiers(new_table(class_name, columns=table_update)) + normalize_table_identifiers( + new_table(class_name, columns=table_update), client.schema.naming + ) ) client.schema._bump_version() - with pytest.raises(PropertyNameConflict): + with pytest.raises(SchemaIdentifierNormalizationCollision): client.update_stored_schema() # _, table_columns = client.get_storage_table("ColClass") @@ -171,12 +185,13 @@ def test_load_case_sensitive_data(client: WeaviateClient, file_storage: FileStor client.schema.update_table(new_table(class_name, columns=[table_create["col1"]])) client.schema._bump_version() client.update_stored_schema() - # prepare a data item where is name clash due to 
Weaviate being CI + # prepare a data item where is name clash due to Weaviate being CS data_clash = {"col1": 72187328, "coL1": 726171} # write row with io.BytesIO() as f: write_dataset(client, f, [data_clash], table_create) query = f.getvalue().decode() + class_name = client.schema.naming.normalize_table_identifier(class_name) with pytest.raises(PropertyNameConflict): expect_load_file(client, file_storage, query, class_name) @@ -202,6 +217,7 @@ def test_load_case_sensitive_data_ci(ci_client: WeaviateClient, file_storage: Fi with io.BytesIO() as f: write_dataset(ci_client, f, [data_clash], table_create) query = f.getvalue().decode() + class_name = ci_client.schema.naming.normalize_table_identifier(class_name) expect_load_file(ci_client, file_storage, query, class_name) response = ci_client.query_class(class_name, ["col1"]).do() objects = response["data"]["Get"][ci_client.make_qualified_class_name(class_name)] diff --git a/tests/load/weaviate/utils.py b/tests/load/weaviate/utils.py index 1b2a74fcb8..b391c2fa38 100644 --- a/tests/load/weaviate/utils.py +++ b/tests/load/weaviate/utils.py @@ -22,53 +22,57 @@ def assert_class( expected_items_count: int = None, items: List[Any] = None, ) -> None: - client: WeaviateClient = pipeline.destination_client() # type: ignore[assignment] - vectorizer_name: str = client._vectorizer_config["vectorizer"] # type: ignore[assignment] - - # Check if class exists - schema = client.get_class_schema(class_name) - assert schema is not None - - columns = pipeline.default_schema.get_table_columns(class_name) - - properties = {prop["name"]: prop for prop in schema["properties"]} - assert set(properties.keys()) == set(columns.keys()) - - # make sure expected columns are vectorized - for column_name, column in columns.items(): - prop = properties[column_name] - assert prop["moduleConfig"][vectorizer_name]["skip"] == ( - not column.get(VECTORIZE_HINT, False) - ) - # tokenization - if TOKENIZATION_HINT in column: - assert prop["tokenization"] == column[TOKENIZATION_HINT] # type: ignore[literal-required] - - # if there's a single vectorize hint, class must have vectorizer enabled - if get_columns_names_with_prop(pipeline.default_schema.get_table(class_name), VECTORIZE_HINT): - assert schema["vectorizer"] == vectorizer_name - else: - assert schema["vectorizer"] == "none" - - # response = db_client.query.get(class_name, list(properties.keys())).do() - response = client.query_class(class_name, list(properties.keys())).do() - objects = response["data"]["Get"][client.make_qualified_class_name(class_name)] - - if expected_items_count is not None: - assert expected_items_count == len(objects) - - if items is None: - return - - # TODO: Remove this once we have a better way comparing the data - drop_keys = ["_dlt_id", "_dlt_load_id"] - objects_without_dlt_keys = [ - {k: v for k, v in obj.items() if k not in drop_keys} for obj in objects - ] - - # pytest compares content wise but ignores order of elements of dict - # assert sorted(objects_without_dlt_keys, key=lambda d: d['doc_id']) == sorted(data, key=lambda d: d['doc_id']) - assert_unordered_list_equal(objects_without_dlt_keys, items) + client: WeaviateClient + with pipeline.destination_client() as client: # type: ignore[assignment] + vectorizer_name: str = client._vectorizer_config["vectorizer"] # type: ignore[assignment] + + # Check if class exists + schema = client.get_class_schema(class_name) + assert schema is not None + + columns = pipeline.default_schema.get_table_columns(class_name) + + properties = {prop["name"]: prop for 
prop in schema["properties"]} + assert set(properties.keys()) == set(columns.keys()) + + # make sure expected columns are vectorized + for column_name, column in columns.items(): + prop = properties[column_name] + if client._is_collection_vectorized(class_name): + assert prop["moduleConfig"][vectorizer_name]["skip"] == ( + not column.get(VECTORIZE_HINT, False) + ) + # tokenization + if TOKENIZATION_HINT in column: + assert prop["tokenization"] == column[TOKENIZATION_HINT] # type: ignore[literal-required] + + # if there's a single vectorize hint, class must have vectorizer enabled + if get_columns_names_with_prop( + pipeline.default_schema.get_table(class_name), VECTORIZE_HINT + ): + assert schema["vectorizer"] == vectorizer_name + else: + assert schema["vectorizer"] == "none" + + # response = db_client.query.get(class_name, list(properties.keys())).do() + response = client.query_class(class_name, list(properties.keys())).do() + objects = response["data"]["Get"][client.make_qualified_class_name(class_name)] + + if expected_items_count is not None: + assert expected_items_count == len(objects) + + if items is None: + return + + # TODO: Remove this once we have a better way comparing the data + drop_keys = ["_dlt_id", "_dlt_load_id"] + objects_without_dlt_keys = [ + {k: v for k, v in obj.items() if k not in drop_keys} for obj in objects + ] + + # pytest compares content wise but ignores order of elements of dict + # assert sorted(objects_without_dlt_keys, key=lambda d: d['doc_id']) == sorted(data, key=lambda d: d['doc_id']) + assert_unordered_list_equal(objects_without_dlt_keys, items) def delete_classes(p, class_list): @@ -87,10 +91,9 @@ def schema_has_classes(client): if Container()[PipelineContext].is_active(): # take existing pipeline p = dlt.pipeline() - client = p.destination_client() - - if schema_has_classes(client): - client.drop_storage() + with p.destination_client() as client: + if schema_has_classes(client): + client.drop_storage() p._wipe_working_folder() # deactivate context diff --git a/tests/normalize/test_max_nesting.py b/tests/normalize/test_max_nesting.py index 4015836232..5def1617dc 100644 --- a/tests/normalize/test_max_nesting.py +++ b/tests/normalize/test_max_nesting.py @@ -62,7 +62,7 @@ def bot_events(): pipeline = dlt.pipeline( pipeline_name=pipeline_name, destination=dummy(timeout=0.1), - full_refresh=True, + dev_mode=True, ) pipeline.run(bot_events) @@ -169,7 +169,7 @@ def some_data(): pipeline = dlt.pipeline( pipeline_name=pipeline_name, destination=dummy(timeout=0.1), - full_refresh=True, + dev_mode=True, ) pipeline.run(some_data(), write_disposition="append") diff --git a/tests/normalize/test_normalize.py b/tests/normalize/test_normalize.py index 3891c667c3..7463184be7 100644 --- a/tests/normalize/test_normalize.py +++ b/tests/normalize/test_normalize.py @@ -16,6 +16,7 @@ from dlt.extract.extract import ExtractStorage from dlt.normalize import Normalize +from dlt.normalize.worker import group_worker_files from dlt.normalize.exceptions import NormalizeJobFailed from tests.cases import JSON_TYPED_DICT, JSON_TYPED_DICT_TYPES @@ -510,28 +511,28 @@ def test_collect_metrics_on_exception(raw_normalize: Normalize) -> None: def test_group_worker_files() -> None: files = ["f%03d" % idx for idx in range(0, 100)] - assert Normalize.group_worker_files([], 4) == [] - assert Normalize.group_worker_files(["f001"], 1) == [["f001"]] - assert Normalize.group_worker_files(["f001"], 100) == [["f001"]] - assert Normalize.group_worker_files(files[:4], 4) == [["f000"], ["f001"], 
["f002"], ["f003"]] - assert Normalize.group_worker_files(files[:5], 4) == [ + assert group_worker_files([], 4) == [] + assert group_worker_files(["f001"], 1) == [["f001"]] + assert group_worker_files(["f001"], 100) == [["f001"]] + assert group_worker_files(files[:4], 4) == [["f000"], ["f001"], ["f002"], ["f003"]] + assert group_worker_files(files[:5], 4) == [ ["f000"], ["f001"], ["f002"], ["f003", "f004"], ] - assert Normalize.group_worker_files(files[:8], 4) == [ + assert group_worker_files(files[:8], 4) == [ ["f000", "f001"], ["f002", "f003"], ["f004", "f005"], ["f006", "f007"], ] - assert Normalize.group_worker_files(files[:8], 3) == [ + assert group_worker_files(files[:8], 3) == [ ["f000", "f001"], ["f002", "f003", "f006"], ["f004", "f005", "f007"], ] - assert Normalize.group_worker_files(files[:5], 3) == [ + assert group_worker_files(files[:5], 3) == [ ["f000"], ["f001", "f003"], ["f002", "f004"], @@ -539,7 +540,7 @@ def test_group_worker_files() -> None: # check if sorted files = ["tab1.1", "chd.3", "tab1.2", "chd.4", "tab1.3"] - assert Normalize.group_worker_files(files, 3) == [ + assert group_worker_files(files, 3) == [ ["chd.3"], ["chd.4", "tab1.2"], ["tab1.1", "tab1.3"], @@ -730,19 +731,22 @@ def test_removal_of_normalizer_schema_section_and_add_seen_data(raw_normalize: N extracted_schema.tables["event__random_table"] = new_table("event__random_table") # add x-normalizer info (and other block to control) - extracted_schema.tables["event"]["x-normalizer"] = {"evolve-columns-once": True} # type: ignore + extracted_schema.tables["event"]["x-normalizer"] = {"evolve-columns-once": True} extracted_schema.tables["event"]["x-other-info"] = "blah" # type: ignore - extracted_schema.tables["event__parse_data__intent_ranking"]["x-normalizer"] = {"seen-data": True, "random-entry": 1234} # type: ignore - extracted_schema.tables["event__random_table"]["x-normalizer"] = {"evolve-columns-once": True} # type: ignore + extracted_schema.tables["event__parse_data__intent_ranking"]["x-normalizer"] = { + "seen-data": True, + "random-entry": 1234, + } + extracted_schema.tables["event__random_table"]["x-normalizer"] = {"evolve-columns-once": True} normalize_pending(raw_normalize, extracted_schema) schema = raw_normalize.schema_storage.load_schema("event") # seen data gets added, schema settings get removed - assert schema.tables["event"]["x-normalizer"] == {"seen-data": True} # type: ignore - assert schema.tables["event__parse_data__intent_ranking"]["x-normalizer"] == { # type: ignore + assert schema.tables["event"]["x-normalizer"] == {"seen-data": True} + assert schema.tables["event__parse_data__intent_ranking"]["x-normalizer"] == { "seen-data": True, "random-entry": 1234, } # no data seen here, so seen-data is not set and evolve settings stays until first data is seen - assert schema.tables["event__random_table"]["x-normalizer"] == {"evolve-columns-once": True} # type: ignore + assert schema.tables["event__random_table"]["x-normalizer"] == {"evolve-columns-once": True} assert "x-other-info" in schema.tables["event"] diff --git a/tests/normalize/utils.py b/tests/normalize/utils.py index 0ce099d4b6..dffb3f1bb6 100644 --- a/tests/normalize/utils.py +++ b/tests/normalize/utils.py @@ -1,15 +1,10 @@ -from typing import Mapping, cast +from dlt.destinations import duckdb, redshift, postgres, bigquery, filesystem -from dlt.destinations.impl.duckdb import capabilities as duck_insert_caps -from dlt.destinations.impl.redshift import capabilities as rd_insert_caps -from dlt.destinations.impl.postgres import 
capabilities as pg_insert_caps -from dlt.destinations.impl.bigquery import capabilities as jsonl_caps -from dlt.destinations.impl.filesystem import capabilities as filesystem_caps - -DEFAULT_CAPS = pg_insert_caps -INSERT_CAPS = [duck_insert_caps, rd_insert_caps, pg_insert_caps] -JSONL_CAPS = [jsonl_caps, filesystem_caps] +# callables to capabilities +DEFAULT_CAPS = postgres().capabilities +INSERT_CAPS = [duckdb().capabilities, redshift().capabilities, DEFAULT_CAPS] +JSONL_CAPS = [bigquery().capabilities, filesystem().capabilities] ALL_CAPABILITIES = INSERT_CAPS + JSONL_CAPS diff --git a/tests/pipeline/cases/github_pipeline/github_pipeline.py b/tests/pipeline/cases/github_pipeline/github_pipeline.py index aa0f6d0e0e..f4cdc2bcf2 100644 --- a/tests/pipeline/cases/github_pipeline/github_pipeline.py +++ b/tests/pipeline/cases/github_pipeline/github_pipeline.py @@ -33,11 +33,21 @@ def load_issues( if __name__ == "__main__": - p = dlt.pipeline("dlt_github_pipeline", destination="duckdb", dataset_name="github_3") + # pick the destination name + if len(sys.argv) < 1: + raise RuntimeError(f"Please provide destination name in args ({sys.argv})") + dest_ = sys.argv[1] + if dest_ == "filesystem": + import os + from dlt.destinations import filesystem + + dest_ = filesystem(os.path.abspath(os.path.join("_storage", "data"))) # type: ignore + + p = dlt.pipeline("dlt_github_pipeline", destination=dest_, dataset_name="github_3") github_source = github() - if len(sys.argv) > 1: + if len(sys.argv) > 2: # load only N issues - limit = int(sys.argv[1]) + limit = int(sys.argv[2]) github_source.add_limit(limit) info = p.run(github_source) print(info) diff --git a/tests/pipeline/test_arrow_sources.py b/tests/pipeline/test_arrow_sources.py index 0c03a8209d..4cdccb1e34 100644 --- a/tests/pipeline/test_arrow_sources.py +++ b/tests/pipeline/test_arrow_sources.py @@ -9,7 +9,11 @@ import dlt from dlt.common import json, Decimal from dlt.common.utils import uniq_id -from dlt.common.libs.pyarrow import NameNormalizationClash, remove_columns, normalize_py_arrow_item +from dlt.common.libs.pyarrow import ( + NameNormalizationCollision, + remove_columns, + normalize_py_arrow_item, +) from dlt.pipeline.exceptions import PipelineStepFailed @@ -17,8 +21,8 @@ arrow_table_all_data_types, prepare_shuffled_tables, ) +from tests.pipeline.utils import assert_only_table_columns, load_tables_to_dicts from tests.utils import ( - preserve_environ, TPythonTableFormat, arrow_item_from_pandas, arrow_item_from_table, @@ -223,7 +227,7 @@ def data_frames(): with pytest.raises(PipelineStepFailed) as py_ex: pipeline.extract(data_frames()) - assert isinstance(py_ex.value.__context__, NameNormalizationClash) + assert isinstance(py_ex.value.__context__, NameNormalizationCollision) @pytest.mark.parametrize("item_type", ["arrow-table", "arrow-batch"]) @@ -507,6 +511,48 @@ def test_empty_arrow(item_type: TPythonTableFormat) -> None: assert norm_info.row_counts["items"] == 0 +def test_import_file_with_arrow_schema() -> None: + pipeline = dlt.pipeline( + pipeline_name="test_jsonl_import", + destination="duckdb", + dev_mode=True, + ) + + # Define the schema based on the CSV input + schema = pa.schema( + [ + ("id", pa.int64()), + ("name", pa.string()), + ("description", pa.string()), + ("ordered_at", pa.date32()), + ("price", pa.float64()), + ] + ) + + # Create empty arrays for each field + empty_arrays = [ + pa.array([], type=pa.int64()), + pa.array([], type=pa.string()), + pa.array([], type=pa.string()), + pa.array([], type=pa.date32()), + pa.array([], 
type=pa.float64()), + ] + + # Create an empty table with the defined schema + empty_table = pa.Table.from_arrays(empty_arrays, schema=schema) + + # columns should be created from empty table + import_file = "tests/load/cases/loading/header.jsonl" + info = pipeline.run( + [dlt.mark.with_file_import(import_file, "jsonl", 2, hints=empty_table)], + table_name="no_header", + ) + info.raise_on_failed_jobs() + assert_only_table_columns(pipeline, "no_header", schema.names) + rows = load_tables_to_dicts(pipeline, "no_header") + assert len(rows["no_header"]) == 2 + + @pytest.mark.parametrize("item_type", ["pandas", "arrow-table", "arrow-batch"]) def test_extract_adds_dlt_load_id(item_type: TPythonTableFormat) -> None: os.environ["NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_LOAD_ID"] = "True" diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index ccf926cc62..ba7c0b9db8 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ -1,4 +1,5 @@ import sys +from subprocess import CalledProcessError import pytest import tempfile import shutil @@ -14,17 +15,19 @@ from dlt.common.storages import FileStorage from dlt.common.schema.typing import ( LOADS_TABLE_NAME, - STATE_TABLE_NAME, + PIPELINE_STATE_TABLE_NAME, VERSION_TABLE_NAME, TStoredSchema, ) from dlt.common.configuration.resolve import resolve_configuration +from dlt.destinations import duckdb, filesystem from dlt.destinations.impl.duckdb.configuration import DuckDbClientConfiguration from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient +from tests.pipeline.utils import load_table_counts from tests.utils import TEST_STORAGE_ROOT, test_storage -if sys.version_info > (3, 11): +if sys.version_info >= (3, 12): pytest.skip("Does not run on Python 3.12 and later", allow_module_level=True) @@ -50,7 +53,9 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: # load 20 issues print( venv.run_script( - "../tests/pipeline/cases/github_pipeline/github_pipeline.py", "20" + "../tests/pipeline/cases/github_pipeline/github_pipeline.py", + "duckdb", + "20", ) ) # load schema and check _dlt_loads definition @@ -66,20 +71,23 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: ) # check the dlt state table assert { - "version_hash" not in github_schema["tables"][STATE_TABLE_NAME]["columns"] + "version_hash" + not in github_schema["tables"][PIPELINE_STATE_TABLE_NAME]["columns"] } # check loads table without attaching to pipeline duckdb_cfg = resolve_configuration( DuckDbClientConfiguration()._bind_dataset_name(dataset_name=GITHUB_DATASET), sections=("destination", "duckdb"), ) - with DuckDbSqlClient(GITHUB_DATASET, duckdb_cfg.credentials) as client: + with DuckDbSqlClient( + GITHUB_DATASET, duckdb_cfg.credentials, duckdb().capabilities() + ) as client: rows = client.execute_sql(f"SELECT * FROM {LOADS_TABLE_NAME}") # make sure we have just 4 columns assert len(rows[0]) == 4 rows = client.execute_sql("SELECT * FROM issues") assert len(rows) == 20 - rows = client.execute_sql(f"SELECT * FROM {STATE_TABLE_NAME}") + rows = client.execute_sql(f"SELECT * FROM {PIPELINE_STATE_TABLE_NAME}") # only 5 columns + 2 dlt columns assert len(rows[0]) == 5 + 2 # inspect old state @@ -99,7 +107,16 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: # execute in current version venv = Venv.restore_current() # load all issues - print(venv.run_script("../tests/pipeline/cases/github_pipeline/github_pipeline.py")) + try: + print( + venv.run_script( + 
"../tests/pipeline/cases/github_pipeline/github_pipeline.py", "duckdb" + ) + ) + except CalledProcessError as cpe: + print(f"script stdout: {cpe.stdout}") + print(f"script stderr: {cpe.stderr}") + raise # hash hash in schema github_schema = json.loads( test_storage.load( @@ -108,13 +125,16 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: ) assert github_schema["engine_version"] == 9 assert "schema_version_hash" in github_schema["tables"][LOADS_TABLE_NAME]["columns"] + # print(github_schema["tables"][PIPELINE_STATE_TABLE_NAME]) # load state state_dict = json.loads( test_storage.load(f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/state.json") ) assert "_version_hash" in state_dict - with DuckDbSqlClient(GITHUB_DATASET, duckdb_cfg.credentials) as client: + with DuckDbSqlClient( + GITHUB_DATASET, duckdb_cfg.credentials, duckdb().capabilities() + ) as client: rows = client.execute_sql( f"SELECT * FROM {LOADS_TABLE_NAME} ORDER BY inserted_at" ) @@ -131,7 +151,9 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: # two schema versions rows = client.execute_sql(f"SELECT * FROM {VERSION_TABLE_NAME}") assert len(rows) == 2 - rows = client.execute_sql(f"SELECT * FROM {STATE_TABLE_NAME} ORDER BY version") + rows = client.execute_sql( + f"SELECT * FROM {PIPELINE_STATE_TABLE_NAME} ORDER BY version" + ) # we have hash columns assert len(rows[0]) == 6 + 2 assert len(rows) == 2 @@ -141,23 +163,82 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: assert rows[1][7] == state_dict["_version_hash"] # attach to existing pipeline - pipeline = dlt.attach(GITHUB_PIPELINE_NAME, credentials=duckdb_cfg.credentials) - created_at_value = pipeline.state["sources"]["github"]["resources"]["load_issues"][ - "incremental" - ]["created_at"]["last_value"] - assert isinstance(created_at_value, pendulum.DateTime) - assert created_at_value == pendulum.parse("2023-02-17T09:52:12Z") - pipeline = pipeline.drop() - # print(pipeline.working_dir) - assert pipeline.dataset_name == GITHUB_DATASET - assert pipeline.default_schema_name is None - # sync from destination - pipeline.sync_destination() - # print(pipeline.working_dir) - # we have updated schema - assert pipeline.default_schema.ENGINE_VERSION == 9 - # make sure that schema hash retrieved from the destination is exactly the same as the schema hash that was in storage before the schema was wiped - assert pipeline.default_schema.stored_version_hash == github_schema["version_hash"] + pipeline = dlt.attach( + GITHUB_PIPELINE_NAME, destination=duckdb(credentials=duckdb_cfg.credentials) + ) + assert_github_pipeline_end_state(pipeline, github_schema, 2) + + +def test_filesystem_pipeline_with_dlt_update(test_storage: FileStorage) -> None: + shutil.copytree("tests/pipeline/cases/github_pipeline", TEST_STORAGE_ROOT, dirs_exist_ok=True) + + # execute in test storage + with set_working_dir(TEST_STORAGE_ROOT): + # store dlt data in test storage (like patch_home_dir) + with custom_environ({"DLT_DATA_DIR": get_dlt_data_dir()}): + # create virtual env with (0.4.9) where filesystem started to store state + with Venv.create(tempfile.mkdtemp(), ["dlt==0.4.9"]) as venv: + try: + print(venv.run_script("github_pipeline.py", "filesystem", "20")) + except CalledProcessError as cpe: + print(f"script stdout: {cpe.stdout}") + print(f"script stderr: {cpe.stderr}") + raise + # load all issues + venv = Venv.restore_current() + try: + print(venv.run_script("github_pipeline.py", "filesystem")) + except CalledProcessError as cpe: + print(f"script 
stdout: {cpe.stdout}") + print(f"script stderr: {cpe.stderr}") + raise + # hash hash in schema + github_schema = json.loads( + test_storage.load( + f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/schemas/github.schema.json" + ) + ) + # attach to existing pipeline + pipeline = dlt.attach(GITHUB_PIPELINE_NAME, destination=filesystem("_storage/data")) + # assert end state + assert_github_pipeline_end_state(pipeline, github_schema, 2) + # load new state + fs_client = pipeline._fs_client() + state_files = sorted(fs_client.list_table_files("_dlt_pipeline_state")) + # first file is in old format + state_1 = json.loads(fs_client.read_text(state_files[0], encoding="utf-8")) + assert "dlt_load_id" in state_1 + # seconds is new + state_2 = json.loads(fs_client.read_text(state_files[1], encoding="utf-8")) + assert "_dlt_load_id" in state_2 + + +def assert_github_pipeline_end_state( + pipeline: dlt.Pipeline, orig_schema: TStoredSchema, schema_updates: int +) -> None: + # get tables counts + table_counts = load_table_counts(pipeline, *pipeline.default_schema.data_table_names()) + assert table_counts == {"issues": 100, "issues__assignees": 31, "issues__labels": 34} + dlt_counts = load_table_counts(pipeline, *pipeline.default_schema.dlt_table_names()) + assert dlt_counts == {"_dlt_version": schema_updates, "_dlt_loads": 2, "_dlt_pipeline_state": 2} + + # check state + created_at_value = pipeline.state["sources"]["github"]["resources"]["load_issues"][ + "incremental" + ]["created_at"]["last_value"] + assert isinstance(created_at_value, pendulum.DateTime) + assert created_at_value == pendulum.parse("2023-02-17T09:52:12Z") + pipeline = pipeline.drop() + # print(pipeline.working_dir) + assert pipeline.dataset_name == GITHUB_DATASET + assert pipeline.default_schema_name is None + # sync from destination + pipeline.sync_destination() + # print(pipeline.working_dir) + # we have updated schema + assert pipeline.default_schema.ENGINE_VERSION == 9 + # make sure that schema hash retrieved from the destination is exactly the same as the schema hash that was in storage before the schema was wiped + assert pipeline.default_schema.stored_version_hash == orig_schema["version_hash"] def test_load_package_with_dlt_update(test_storage: FileStorage) -> None: @@ -182,7 +263,7 @@ def test_load_package_with_dlt_update(test_storage: FileStorage) -> None: ) print( venv.run_script( - "../tests/pipeline/cases/github_pipeline/github_normalize.py", + "../tests/pipeline/cases/github_pipeline/github_normalize.py" ) ) # switch to current version and make sure the load package loads and schema migrates @@ -192,7 +273,9 @@ def test_load_package_with_dlt_update(test_storage: FileStorage) -> None: DuckDbClientConfiguration()._bind_dataset_name(dataset_name=GITHUB_DATASET), sections=("destination", "duckdb"), ) - with DuckDbSqlClient(GITHUB_DATASET, duckdb_cfg.credentials) as client: + with DuckDbSqlClient( + GITHUB_DATASET, duckdb_cfg.credentials, duckdb().capabilities() + ) as client: rows = client.execute_sql("SELECT * FROM issues") assert len(rows) == 70 github_schema = json.loads( @@ -201,7 +284,9 @@ def test_load_package_with_dlt_update(test_storage: FileStorage) -> None: ) ) # attach to existing pipeline - pipeline = dlt.attach(GITHUB_PIPELINE_NAME, credentials=duckdb_cfg.credentials) + pipeline = dlt.attach( + GITHUB_PIPELINE_NAME, destination=duckdb(credentials=duckdb_cfg.credentials) + ) # get the schema from schema storage before we sync github_schema = json.loads( test_storage.load( @@ -217,7 +302,7 @@ def 
test_load_package_with_dlt_update(test_storage: FileStorage) -> None: assert pipeline.state["_version_hash"] is not None # but in db there's no hash - we loaded an old package with backward compatible schema with pipeline.sql_client() as client: - rows = client.execute_sql(f"SELECT * FROM {STATE_TABLE_NAME}") + rows = client.execute_sql(f"SELECT * FROM {PIPELINE_STATE_TABLE_NAME}") # no hash assert len(rows[0]) == 5 + 2 assert len(rows) == 1 @@ -227,7 +312,7 @@ def test_load_package_with_dlt_update(test_storage: FileStorage) -> None: # this will sync schema to destination pipeline.sync_schema() # we have hash now - rows = client.execute_sql(f"SELECT * FROM {STATE_TABLE_NAME}") + rows = client.execute_sql(f"SELECT * FROM {PIPELINE_STATE_TABLE_NAME}") assert len(rows[0]) == 6 + 2 diff --git a/tests/pipeline/test_import_export_schema.py b/tests/pipeline/test_import_export_schema.py index 6f40e1d1eb..eb36d36ba3 100644 --- a/tests/pipeline/test_import_export_schema.py +++ b/tests/pipeline/test_import_export_schema.py @@ -117,7 +117,7 @@ def test_import_schema_is_respected() -> None: destination=dummy(completed_prob=1), import_schema_path=IMPORT_SCHEMA_PATH, export_schema_path=EXPORT_SCHEMA_PATH, - full_refresh=True, + dev_mode=True, ) p.extract(EXAMPLE_DATA, table_name="person") # starts with import schema v 1 that is dirty -> 2 @@ -153,7 +153,7 @@ def resource(): destination=dummy(completed_prob=1), import_schema_path=IMPORT_SCHEMA_PATH, export_schema_path=EXPORT_SCHEMA_PATH, - full_refresh=True, + dev_mode=True, ) p.run(source()) diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index f838f31333..95b97c7666 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -7,7 +7,7 @@ import random import threading from time import sleep -from typing import Any, Tuple, cast +from typing import Any, List, Tuple, cast from tenacity import retry_if_exception, Retrying, stop_after_attempt import pytest @@ -19,6 +19,7 @@ from dlt.common.configuration.specs.aws_credentials import AwsCredentials from dlt.common.configuration.specs.exceptions import NativeValueError from dlt.common.configuration.specs.gcp_credentials import GcpOAuthCredentials +from dlt.common.data_writers.exceptions import FileImportNotFound, SpecLookupFailed from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import WithStateSync from dlt.common.destination.exceptions import ( @@ -32,6 +33,8 @@ from dlt.common.exceptions import PipelineStateNotAvailable from dlt.common.pipeline import LoadInfo, PipelineContext from dlt.common.runtime.collector import LogCollector +from dlt.common.schema.exceptions import TableIdentifiersFrozen +from dlt.common.schema.typing import TColumnSchema from dlt.common.schema.utils import new_column, new_table from dlt.common.typing import DictStrAny from dlt.common.utils import uniq_id @@ -44,9 +47,11 @@ from dlt.extract import DltResource, DltSource from dlt.extract.extractors import MaterializedEmptyList from dlt.load.exceptions import LoadClientJobFailed +from dlt.normalize.exceptions import NormalizeJobFailed from dlt.pipeline.exceptions import InvalidPipelineName, PipelineNotActive, PipelineStepFailed from dlt.pipeline.helpers import retry_load +from dlt.pipeline.pipeline import Pipeline from tests.common.utils import TEST_SENTRY_DSN from tests.common.configuration.utils import environment from tests.utils import TEST_STORAGE_ROOT, skipifnotwindows @@ -55,7 +60,9 @@ assert_data_table_counts, 
assert_load_info, airtable_emojis, + assert_only_table_columns, load_data_table_counts, + load_tables_to_dicts, many_delayed, ) @@ -201,7 +208,8 @@ def test_pipeline_context() -> None: assert ctx.pipeline() is p3 assert p3.is_active is True assert p2.is_active is False - assert Container()[DestinationCapabilitiesContext].naming_convention == "snake_case" + # no default naming convention + assert Container()[DestinationCapabilitiesContext].naming_convention is None # restore previous p2 = dlt.attach("another pipeline") @@ -1539,10 +1547,13 @@ def autodetect(): pipeline = pipeline.drop() source = autodetect() + assert "timestamp" in source.schema.settings["detections"] source.schema.remove_type_detection("timestamp") + assert "timestamp" not in source.schema.settings["detections"] pipeline = dlt.pipeline(destination="duckdb") pipeline.run(source) + assert "timestamp" not in pipeline.default_schema.settings["detections"] assert pipeline.default_schema.get_table("numbers")["columns"]["value"]["data_type"] == "bigint" @@ -1969,7 +1980,7 @@ def source(): assert len(load_info.loads_ids) == 1 -def test_pipeline_load_info_metrics_schema_is_not_chaning() -> None: +def test_pipeline_load_info_metrics_schema_is_not_changing() -> None: """Test if load info schema is idempotent throughout multiple load cycles ## Setup @@ -2025,7 +2036,6 @@ def demand_map(): pipeline_name="quick_start", destination="duckdb", dataset_name="mydata", - # export_schema_path="schemas", ) taxi_load_info = pipeline.run( @@ -2243,7 +2253,7 @@ def test_data(): pipeline = dlt.pipeline( pipeline_name="test_staging_cleared", destination="duckdb", - full_refresh=True, + dev_mode=True, ) info = pipeline.run(test_data, table_name="staging_cleared") @@ -2260,3 +2270,198 @@ def test_data(): with client.execute_query(f"SELECT * FROM {pipeline.dataset_name}.staging_cleared") as cur: assert len(cur.fetchall()) == 3 + + +def test_change_naming_convention_name_collision() -> None: + duck_ = dlt.destinations.duckdb(naming_convention="duck_case", recommended_file_size=120000) + caps = duck_.capabilities() + assert caps.naming_convention == "duck_case" + assert caps.recommended_file_size == 120000 + + # use duck case to load data into duckdb so casing and emoji are preserved + pipeline = dlt.pipeline("test_change_naming_convention_name_collision", destination=duck_) + info = pipeline.run( + airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock") + ) + assert_load_info(info) + # make sure that emojis got in + assert "🦚Peacock" in pipeline.default_schema.tables + assert "🔑id" in pipeline.default_schema.tables["🦚Peacock"]["columns"] + assert load_data_table_counts(pipeline) == { + "📆 Schedule": 3, + "🦚Peacock": 1, + "🦚WidePeacock": 1, + "🦚Peacock__peacock": 3, + "🦚WidePeacock__Peacock": 3, + } + with pipeline.sql_client() as client: + rows = client.execute_sql("SELECT 🔑id FROM 🦚Peacock") + # 🔑id value is 1 + assert rows[0][0] == 1 + + # change naming convention and run pipeline again so we generate name clashes + os.environ["SOURCES__AIRTABLE_EMOJIS__SCHEMA__NAMING"] = "sql_ci_v1" + with pytest.raises(PipelineStepFailed) as pip_ex: + pipeline.run(airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock")) + assert isinstance(pip_ex.value.__cause__, TableIdentifiersFrozen) + + # all good if we drop tables + # info = pipeline.run( + # airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock"), + # refresh="drop_resources", + # ) + # assert_load_info(info) + # assert load_data_table_counts(pipeline) 
== { + # "📆 Schedule": 3, + # "🦚Peacock": 1, + # "🦚WidePeacock": 1, + # "🦚Peacock__peacock": 3, + # "🦚WidePeacock__Peacock": 3, + # } + + +def test_change_naming_convention_column_collision() -> None: + duck_ = dlt.destinations.duckdb(naming_convention="duck_case") + + data = {"Col": "A"} + pipeline = dlt.pipeline("test_change_naming_convention_column_collision", destination=duck_) + info = pipeline.run([data], table_name="data") + assert_load_info(info) + + os.environ["SCHEMA__NAMING"] = "sql_ci_v1" + with pytest.raises(PipelineStepFailed) as pip_ex: + pipeline.run([data], table_name="data") + assert isinstance(pip_ex.value.__cause__, TableIdentifiersFrozen) + + +def test_import_jsonl_file() -> None: + pipeline = dlt.pipeline( + pipeline_name="test_jsonl_import", + destination="duckdb", + dev_mode=True, + ) + columns: List[TColumnSchema] = [ + {"name": "id", "data_type": "bigint", "nullable": False}, + {"name": "name", "data_type": "text"}, + {"name": "description", "data_type": "text"}, + {"name": "ordered_at", "data_type": "date"}, + {"name": "price", "data_type": "decimal"}, + ] + import_file = "tests/load/cases/loading/header.jsonl" + info = pipeline.run( + [dlt.mark.with_file_import(import_file, "jsonl", 2)], + table_name="no_header", + loader_file_format="jsonl", + columns=columns, + ) + info.raise_on_failed_jobs() + print(info) + assert_imported_file(pipeline, "no_header", columns, 2) + + # use hints to infer + hints = dlt.mark.make_hints(columns=columns) + info = pipeline.run( + [dlt.mark.with_file_import(import_file, "jsonl", 2, hints=hints)], + table_name="no_header_2", + ) + info.raise_on_failed_jobs() + assert_imported_file(pipeline, "no_header_2", columns, 2, expects_state=False) + + +def test_import_file_without_sniff_schema() -> None: + pipeline = dlt.pipeline( + pipeline_name="test_jsonl_import", + destination="duckdb", + dev_mode=True, + ) + import_file = "tests/load/cases/loading/header.jsonl" + info = pipeline.run( + [dlt.mark.with_file_import(import_file, "jsonl", 2)], + table_name="no_header", + ) + assert info.has_failed_jobs + print(info) + + +def test_import_non_existing_file() -> None: + pipeline = dlt.pipeline( + pipeline_name="test_jsonl_import", + destination="duckdb", + dev_mode=True, + ) + # this file does not exist + import_file = "tests/load/cases/loading/X_header.jsonl" + with pytest.raises(PipelineStepFailed) as pip_ex: + pipeline.run( + [dlt.mark.with_file_import(import_file, "jsonl", 2)], + table_name="no_header", + ) + inner_ex = pip_ex.value.__cause__ + assert isinstance(inner_ex, FileImportNotFound) + assert inner_ex.import_file_path == import_file + + +def test_import_unsupported_file_format() -> None: + pipeline = dlt.pipeline( + pipeline_name="test_jsonl_import", + destination="duckdb", + dev_mode=True, + ) + # this file does not exist + import_file = "tests/load/cases/loading/csv_no_header.csv" + with pytest.raises(PipelineStepFailed) as pip_ex: + pipeline.run( + [dlt.mark.with_file_import(import_file, "csv", 2)], + table_name="no_header", + ) + inner_ex = pip_ex.value.__cause__ + assert isinstance(inner_ex, NormalizeJobFailed) + assert isinstance(inner_ex.__cause__, SpecLookupFailed) + + +def test_import_unknown_file_format() -> None: + pipeline = dlt.pipeline( + pipeline_name="test_jsonl_import", + destination="duckdb", + dev_mode=True, + ) + # this file does not exist + import_file = "tests/load/cases/loading/csv_no_header.csv" + with pytest.raises(PipelineStepFailed) as pip_ex: + pipeline.run( + [dlt.mark.with_file_import(import_file, 
"unknown", 2)], # type: ignore[arg-type] + table_name="no_header", + ) + inner_ex = pip_ex.value.__cause__ + assert isinstance(inner_ex, NormalizeJobFailed) + # can't figure format from extension + assert isinstance(inner_ex.__cause__, ValueError) + + +def assert_imported_file( + pipeline: Pipeline, + table_name: str, + columns: List[TColumnSchema], + expected_rows: int, + expects_state: bool = True, +) -> None: + assert_only_table_columns(pipeline, table_name, [col["name"] for col in columns]) + rows = load_tables_to_dicts(pipeline, table_name) + assert len(rows[table_name]) == expected_rows + # we should have twp files loaded + jobs = pipeline.last_trace.last_load_info.load_packages[0].jobs["completed_jobs"] + job_extensions = [os.path.splitext(job.job_file_info.file_name())[1] for job in jobs] + assert ".jsonl" in job_extensions + if expects_state: + assert ".insert_values" in job_extensions + # check extract trace if jsonl is really there + extract_info = pipeline.last_trace.last_extract_info + jobs = extract_info.load_packages[0].jobs["new_jobs"] + # find jsonl job + jsonl_job = next(job for job in jobs if job.job_file_info.table_name == table_name) + assert jsonl_job.job_file_info.file_format == "jsonl" + # find metrics for table + assert ( + extract_info.metrics[extract_info.loads_ids[0]][0]["table_metrics"][table_name].items_count + == expected_rows + ) diff --git a/tests/pipeline/test_pipeline_extra.py b/tests/pipeline/test_pipeline_extra.py index 7208216c9f..308cdcd91d 100644 --- a/tests/pipeline/test_pipeline_extra.py +++ b/tests/pipeline/test_pipeline_extra.py @@ -40,7 +40,11 @@ class BaseModel: # type: ignore[no-redef] @pytest.mark.parametrize( - "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name + "destination_config", + destinations_configs( + default_sql_configs=True, default_vector_configs=True, local_filesystem_configs=True + ), + ids=lambda x: x.name, ) def test_create_pipeline_all_destinations(destination_config: DestinationTestConfiguration) -> None: # create pipelines, extract and normalize. 
that should be possible without installing any dependencies @@ -51,11 +55,11 @@ def test_create_pipeline_all_destinations(destination_config: DestinationTestCon ) # are capabilities injected caps = p._container[DestinationCapabilitiesContext] - print(caps.naming_convention) - # are right naming conventions created - assert p._default_naming.max_length == min( - caps.max_column_identifier_length, caps.max_identifier_length - ) + if caps.naming_convention: + assert p.naming.name() == caps.naming_convention + else: + assert p.naming.name() == "snake_case" + p.extract([1, "2", 3], table_name="data") # is default schema with right naming convention assert p.default_schema.naming.max_length == min( @@ -469,6 +473,61 @@ def users(): assert set(table.schema.names) == {"id", "name", "_dlt_load_id", "_dlt_id"} +def test_resource_file_format() -> None: + os.environ["RESTORE_FROM_DESTINATION"] = "False" + + def jsonl_data(): + yield [ + { + "id": 1, + "name": "item", + "description": "value", + "ordered_at": "2024-04-12", + "price": 128.4, + }, + { + "id": 1, + "name": "item", + "description": "value with space", + "ordered_at": "2024-04-12", + "price": 128.4, + }, + ] + + # preferred file format will use destination preferred format + jsonl_preferred = dlt.resource(jsonl_data, file_format="preferred", name="jsonl_preferred") + assert jsonl_preferred.compute_table_schema()["file_format"] == "preferred" + + jsonl_r = dlt.resource(jsonl_data, file_format="jsonl", name="jsonl_r") + assert jsonl_r.compute_table_schema()["file_format"] == "jsonl" + + jsonl_pq = dlt.resource(jsonl_data, file_format="parquet", name="jsonl_pq") + assert jsonl_pq.compute_table_schema()["file_format"] == "parquet" + + info = dlt.pipeline("example", destination="duckdb").run([jsonl_preferred, jsonl_r, jsonl_pq]) + info.raise_on_failed_jobs() + # check file types on load jobs + load_jobs = { + job.job_file_info.table_name: job.job_file_info + for job in info.load_packages[0].jobs["completed_jobs"] + } + assert load_jobs["jsonl_r"].file_format == "jsonl" + assert load_jobs["jsonl_pq"].file_format == "parquet" + assert load_jobs["jsonl_preferred"].file_format == "insert_values" + + # test not supported format + csv_r = dlt.resource(jsonl_data, file_format="csv", name="csv_r") + assert csv_r.compute_table_schema()["file_format"] == "csv" + info = dlt.pipeline("example", destination="duckdb").run(csv_r) + info.raise_on_failed_jobs() + # fallback to preferred + load_jobs = { + job.job_file_info.table_name: job.job_file_info + for job in info.load_packages[0].jobs["completed_jobs"] + } + assert load_jobs["csv_r"].file_format == "insert_values" + + def test_pick_matching_file_format(test_storage: FileStorage) -> None: from dlt.destinations import filesystem diff --git a/tests/pipeline/test_pipeline_state.py b/tests/pipeline/test_pipeline_state.py index 8cbc1ca516..11c45d72cc 100644 --- a/tests/pipeline/test_pipeline_state.py +++ b/tests/pipeline/test_pipeline_state.py @@ -1,20 +1,25 @@ import os import shutil +from typing_extensions import get_type_hints import pytest import dlt - +from dlt.common.pendulum import pendulum from dlt.common.exceptions import ( PipelineStateNotAvailable, ResourceNameNotAvailable, ) from dlt.common.schema import Schema +from dlt.common.schema.utils import pipeline_state_table from dlt.common.source import get_current_pipe_name from dlt.common.storages import FileStorage from dlt.common import pipeline as state_module +from dlt.common.storages.load_package import TPipelineStateDoc from dlt.common.utils import 
uniq_id -from dlt.common.destination.reference import Destination +from dlt.common.destination.reference import Destination, StateInfo +from dlt.common.validation import validate_dict +from dlt.destinations.utils import get_pipeline_state_query_columns from dlt.pipeline.exceptions import PipelineStateEngineNoUpgradePathException, PipelineStepFailed from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.state_sync import ( @@ -41,6 +46,56 @@ def some_data_resource_state(): dlt.current.resource_state()["last_value"] = last_value + 1 +def test_state_repr() -> None: + """Verify that all possible state representations match""" + table = pipeline_state_table() + state_doc_hints = get_type_hints(TPipelineStateDoc) + sync_class_hints = get_type_hints(StateInfo) + info = StateInfo(1, 4, "pipeline", "compressed", pendulum.now(), "hash", "_load_id") + state_doc = info.as_doc() + # just in case hardcode column order + reference_cols = [ + "version", + "engine_version", + "pipeline_name", + "state", + "created_at", + "version_hash", + "_dlt_load_id", + ] + # doc and table must be in the same order with the same name + assert ( + len(table["columns"]) + == len(state_doc_hints) + == len(sync_class_hints) + == len(state_doc) + == len(reference_cols) + ) + for col, hint, class_hint, val, ref_col in zip( + table["columns"].values(), state_doc_hints, sync_class_hints, state_doc, reference_cols + ): + assert col["name"] == hint == class_hint == val == ref_col + + # validate info + validate_dict(TPipelineStateDoc, state_doc, "$") + + info = StateInfo(1, 4, "pipeline", "compressed", pendulum.now()) + state_doc = info.as_doc() + assert "_dlt_load_id" not in state_doc + assert "version_hash" not in state_doc + + # we drop hash in query + compat_table = get_pipeline_state_query_columns() + assert list(compat_table["columns"].keys()) == [ + "version", + "engine_version", + "pipeline_name", + "state", + "created_at", + "_dlt_load_id", + ] + + def test_restore_state_props() -> None: p = dlt.pipeline( pipeline_name="restore_state_props", diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index 7affcc5a81..c10618a7cc 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -52,7 +52,7 @@ def peacock(): @dlt.resource(name="🦚WidePeacock", selected=False) def wide_peacock(): - yield [{"peacock": [1, 2, 3]}] + yield [{"Peacock": [1, 2, 3]}] return budget, schedule, peacock, wide_peacock @@ -198,7 +198,7 @@ def _load_tables_to_dicts_sql( for table_name in table_names: table_rows = [] columns = schema.get_table_columns(table_name).keys() - query_columns = ",".join(map(p.sql_client().capabilities.escape_identifier, columns)) + query_columns = ",".join(map(p.sql_client().escape_column_name, columns)) with p.sql_client() as c: query_columns = ",".join(map(c.escape_column_name, columns)) diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 7196ef3436..aa3f02e51d 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -234,7 +234,6 @@ def test_oauth2_client_credentials_flow_wrong_client_secret(self, rest_client: R assert e.type == HTTPError assert e.match("401 Client Error") - def test_oauth_token_expired_refresh(self, rest_client_immediate_oauth_expiry: RESTClient): rest_client = rest_client_immediate_oauth_expiry auth = cast(OAuth2ClientCredentials, rest_client.auth) diff --git a/tests/utils.py b/tests/utils.py index 580c040706..47b6561c8e 100644 --- a/tests/utils.py 
+++ b/tests/utils.py @@ -173,7 +173,7 @@ def unload_modules() -> Iterator[None]: @pytest.fixture(autouse=True) -def wipe_pipeline() -> Iterator[None]: +def wipe_pipeline(preserve_environ) -> Iterator[None]: """Wipes pipeline local state and deactivates it""" container = Container() if container[PipelineContext].is_active(): From 8fef07c5659255b24921988621d2a2ea2dd83afa Mon Sep 17 00:00:00 2001 From: David Scharf Date: Thu, 27 Jun 2024 14:36:42 +0200 Subject: [PATCH 39/61] apply forked decorator to example tests (#1522) * apply forked decorator to example tests * add missing pytest import --- docs/tools/prepare_examples_tests.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/tools/prepare_examples_tests.py b/docs/tools/prepare_examples_tests.py index a300b1eb8f..58e56cc15b 100644 --- a/docs/tools/prepare_examples_tests.py +++ b/docs/tools/prepare_examples_tests.py @@ -17,6 +17,8 @@ # some stuff to insert for setting up and tearing down fixtures TEST_HEADER = """ +import pytest + from tests.utils import skipifgithubfork """ @@ -62,7 +64,8 @@ # convert the main clause to a test function if line.startswith(MAIN_CLAUSE): main_clause_found = True - processed_lines.append("@skipifgithubfork") + processed_lines.append("@skipifgithubfork") # skip on forks + processed_lines.append("@pytest.mark.forked") # skip on forks processed_lines.append(f"def test_{example}():") else: processed_lines.append(line) From c00d408d30810be87f81fc109be047586b41e4a1 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Thu, 27 Jun 2024 15:07:35 +0200 Subject: [PATCH 40/61] update default logging to not pad log level (#1517) --- dlt/common/configuration/specs/run_configuration.py | 2 +- tests/common/configuration/test_configuration.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dlt/common/configuration/specs/run_configuration.py b/dlt/common/configuration/specs/run_configuration.py index ed85aae8ba..6833b7678d 100644 --- a/dlt/common/configuration/specs/run_configuration.py +++ b/dlt/common/configuration/specs/run_configuration.py @@ -17,7 +17,7 @@ class RunConfiguration(BaseConfiguration): dlthub_telemetry: bool = True # enable or disable dlthub telemetry dlthub_telemetry_endpoint: Optional[str] = "https://telemetry.scalevector.ai" dlthub_telemetry_segment_write_key: Optional[str] = None - log_format: str = "{asctime}|[{levelname:<21}]|{process}|{thread}|{name}|{filename}|{funcName}:{lineno}|{message}" + log_format: str = "{asctime}|[{levelname}]|{process}|{thread}|{name}|{filename}|{funcName}:{lineno}|{message}" log_level: str = "WARNING" request_timeout: float = 60 """Timeout for http requests""" diff --git a/tests/common/configuration/test_configuration.py b/tests/common/configuration/test_configuration.py index 48993971c2..7c3138ea73 100644 --- a/tests/common/configuration/test_configuration.py +++ b/tests/common/configuration/test_configuration.py @@ -621,7 +621,7 @@ class _SecretCredentials(RunConfiguration): "dlthub_telemetry": True, "dlthub_telemetry_endpoint": "https://telemetry-tracker.services4758.workers.dev", "dlthub_telemetry_segment_write_key": None, - "log_format": "{asctime}|[{levelname:<21}]|{process}|{thread}|{name}|{filename}|{funcName}:{lineno}|{message}", + "log_format": "{asctime}|[{levelname}]|{process}|{thread}|{name}|{filename}|{funcName}:{lineno}|{message}", "log_level": "WARNING", "request_timeout": 60, "request_max_attempts": 5, From 78cdb0b8a2f3c85236cfccca5c5b30d22514fe67 Mon Sep 17 00:00:00 2001 From: Marcel Coetzee 
<34739235+Pipboyguy@users.noreply.github.com> Date: Thu, 27 Jun 2024 16:24:53 +0200 Subject: [PATCH 41/61] LanceDB Destination (#1375) * Added lancedb as an optional dependency Signed-off-by: Marcel Coetzee * Added lancedb to dependencies in test workflow Signed-off-by: Marcel Coetzee * Add initial capabilities for LanceDB destination Signed-off-by: Marcel Coetzee * Added new lancedb_adapter Signed-off-by: Marcel Coetzee * Added LanceDB factory in destinations implementation Signed-off-by: Marcel Coetzee * Added LanceDB client configuration with embedding details Signed-off-by: Marcel Coetzee * Added LanceDB Client with data load and schema management functionalities Signed-off-by: Marcel Coetzee * Lockfile Signed-off-by: Marcel Coetzee * Wireframe LanceDB client implementation Signed-off-by: Marcel Coetzee * Add abstract methods Signed-off-by: Marcel Coetzee * Enhance LanceDB client with additional functionality Signed-off-by: Marcel Coetzee * Add tests and GitHub workflow for LanceDB destination Signed-off-by: Marcel Coetzee * Update Python version to 3.11.x in GitHub workflow Signed-off-by: Marcel Coetzee * Refactor and cleanup LanceDBClient and LoadLanceDBJob classes Signed-off-by: Marcel Coetzee * Refactor load tests in lancedb/utils.py and add test for LanceDB model inference Signed-off-by: Marcel Coetzee * Added functionality to infer LanceDB model from data and refactored name for reserved fields Signed-off-by: Marcel Coetzee * Remove storage options Storage options are only available in asynchronous Python API. See https://lancedb.github.io/lancedb/guides/storage/ Signed-off-by: Marcel Coetzee * Refactor test pipeline and implement lancedb_adapter in LanceDBClient Signed-off-by: Marcel Coetzee * Add schema argument to LoadLanceDBJob function Signed-off-by: Marcel Coetzee * Format Signed-off-by: Marcel Coetzee * Refactor LanceDB related code and increase type hint coverage Signed-off-by: Marcel Coetzee * Refactor LanceDB client and tests, enhance DB type mapping Signed-off-by: Marcel Coetzee * Refactor code to improve readability by reducing line breaks Signed-off-by: Marcel Coetzee * Refactor LanceDB client code by adding schema_conversion and utils modules Signed-off-by: Marcel Coetzee * Remove redundant variables in lancedb_client.py Signed-off-by: Marcel Coetzee * Refactor code to improve readability and move environment variable set function to utils.py Signed-off-by: Marcel Coetzee * Refactor LanceDB client implementation and error handling Signed-off-by: Marcel Coetzee * Refactor code for better readability and add type ignore comments Signed-off-by: Marcel Coetzee * Added lancedb as an optional dependency Signed-off-by: Marcel Coetzee * Added lancedb to dependencies in test workflow Signed-off-by: Marcel Coetzee * Add initial capabilities for LanceDB destination Signed-off-by: Marcel Coetzee * Added new lancedb_adapter Signed-off-by: Marcel Coetzee * Added LanceDB factory in destinations implementation Signed-off-by: Marcel Coetzee * Added LanceDB client configuration with embedding details Signed-off-by: Marcel Coetzee * Added LanceDB Client with data load and schema management functionalities Signed-off-by: Marcel Coetzee * Wireframe LanceDB client implementation Signed-off-by: Marcel Coetzee * Add abstract methods Signed-off-by: Marcel Coetzee * Enhance LanceDB client with additional functionality Signed-off-by: Marcel Coetzee * Add tests and GitHub workflow for LanceDB destination Signed-off-by: Marcel Coetzee * Update Python version to 3.11.x in GitHub workflow 
Signed-off-by: Marcel Coetzee * Refactor and cleanup LanceDBClient and LoadLanceDBJob classes Signed-off-by: Marcel Coetzee * Refactor load tests in lancedb/utils.py and add test for LanceDB model inference Signed-off-by: Marcel Coetzee * Added functionality to infer LanceDB model from data and refactored name for reserved fields Signed-off-by: Marcel Coetzee * Remove storage options Storage options are only available in asynchronous Python API. See https://lancedb.github.io/lancedb/guides/storage/ Signed-off-by: Marcel Coetzee * Refactor test pipeline and implement lancedb_adapter in LanceDBClient Signed-off-by: Marcel Coetzee * Add schema argument to LoadLanceDBJob function Signed-off-by: Marcel Coetzee * Format Signed-off-by: Marcel Coetzee * Refactor LanceDB related code and increase type hint coverage Signed-off-by: Marcel Coetzee * Refactor LanceDB client and tests, enhance DB type mapping Signed-off-by: Marcel Coetzee * Refactor code to improve readability by reducing line breaks Signed-off-by: Marcel Coetzee * Refactor LanceDB client code by adding schema_conversion and utils modules Signed-off-by: Marcel Coetzee * Remove redundant variables in lancedb_client.py Signed-off-by: Marcel Coetzee * Refactor code to improve readability and move environment variable set function to utils.py Signed-off-by: Marcel Coetzee * Refactor LanceDB client implementation and error handling Signed-off-by: Marcel Coetzee * Refactor code for better readability and add type ignore comments Signed-off-by: Marcel Coetzee * Dependency Versioning Signed-off-by: Marcel Coetzee * Remove unnecessary dependencies and update lancedb and pylance versions Signed-off-by: Marcel Coetzee * Silence mypy warnings Signed-off-by: Marcel Coetzee * Revert mypy ignores Signed-off-by: Marcel Coetzee * Revert mypy ignores Signed-off-by: Marcel Coetzee * Fix versioning with 3.8 Signed-off-by: Marcel Coetzee * Fix versioning Signed-off-by: Marcel Coetzee * Update default URI and dataset separator in LanceDB configuration Signed-off-by: Marcel Coetzee * Refactor LanceDB typemapper with timestamp and decimal precision adjustments Signed-off-by: Marcel Coetzee * Updated method for retrieving sentinel table name Signed-off-by: Marcel Coetzee * Remove redundant table normalisation for version_table_name Signed-off-by: Marcel Coetzee * Refactor LanceDB functionalities and improve handling of optional embedding fields Signed-off-by: Marcel Coetzee * Refactor LanceDBClient and update parameter defaults in schema.py Signed-off-by: Marcel Coetzee * Added lancedb to default vector configs and improved type annotations in tests. 
Signed-off-by: Marcel Coetzee * Return self in enter context manager method Signed-off-by: Marcel Coetzee * Handle FileNotFoundError Signed-off-by: Marcel Coetzee * Replace FileNotFoundError with DestinationUndefinedEntity in lancedb_client.py Signed-off-by: Marcel Coetzee * Refactor LanceDB client for simplified table name handling Signed-off-by: Marcel Coetzee * Refactored LanceDB schema creation and storage update processes to pyarrow Signed-off-by: Marcel Coetzee * Remove LanceModels Signed-off-by: Marcel Coetzee * Ensure 'records' is a list in lancedb_client.py Signed-off-by: Marcel Coetzee * Refactor code and add batch error handling in lancedb client Signed-off-by: Marcel Coetzee * Refactor LanceDB client and schema for improved embedding handling Signed-off-by: Marcel Coetzee * Improve error handling and retries in LanceDB client Signed-off-by: Marcel Coetzee * Add error decorator to get_stored_state method in lancedb_client Signed-off-by: Marcel Coetzee * Change error handling from FileNotFoundError to IndexError Signed-off-by: Marcel Coetzee * Refactor lancedb_client.py and add error decorators Signed-off-by: Marcel Coetzee * Add configurable read consistency to LanceDB client Signed-off-by: Marcel Coetzee * Versioning Signed-off-by: Marcel Coetzee * Refactor code for readability and change return type in tests Signed-off-by: Marcel Coetzee * Update queries in lancedb_client to order by insertion date Signed-off-by: Marcel Coetzee * Refactor LanceDB client and schema for better table creation and management Signed-off-by: Marcel Coetzee * Combine "skip" and "append" write dispositions in batch upload Signed-off-by: Marcel Coetzee * Add schema version hash check in LanceDB client write operations Signed-off-by: Marcel Coetzee * Remove testing code Signed-off-by: Marcel Coetzee * Refactor return statement in lancedb_client for successful state loads Signed-off-by: Marcel Coetzee * Update lancedb_client.py to improve table handling and embedding fields Signed-off-by: Marcel Coetzee * Refactor LanceDB schema generation and handle metadata for embedding functions Signed-off-by: Marcel Coetzee * Refactor schema creation and remove unused code Signed-off-by: Marcel Coetzee * Add mapping for provider environment variables and update schema comment Signed-off-by: Marcel Coetzee * Update package versions in pyproject.toml and poetry.lock Signed-off-by: Marcel Coetzee * Refactor LanceDB utils and client, handle exception and remove unnecessary comment Signed-off-by: Marcel Coetzee * Refactor utility functions in lancedb tests Signed-off-by: Marcel Coetzee * Update 'replace' mode and improve table handling in lancedb client Signed-off-by: Marcel Coetzee * Refactor assert_unordered_list_equal to handle dictionaries Signed-off-by: Marcel Coetzee * Refactor code for better readability and remove unnecessary blank lines Signed-off-by: Marcel Coetzee * Refactor code for readability and remove redundant comments Signed-off-by: Marcel Coetzee * Update sentinel table name in test_pipeline.py Signed-off-by: Marcel Coetzee * "Add order by clause to database query in lancedb_client" Signed-off-by: Marcel Coetzee * Use super method to reduce redundancy Signed-off-by: Marcel Coetzee * Syntax Signed-off-by: Marcel Coetzee * Remove bare except clauses Signed-off-by: Marcel Coetzee * Revert "Remove bare except clauses" This reverts commit 3b446312bd73a73372aa2cb007ed14d2ec13490b. 
* Remove bare except clause Signed-off-by: Marcel Coetzee * Remove bare except clause Signed-off-by: Marcel Coetzee * Remove bare except clause Signed-off-by: Marcel Coetzee * Remove bare except clause Signed-off-by: Marcel Coetzee * Refactor error handling in LanceDB client Signed-off-by: Marcel Coetzee * Add configurable sentinel table name in LanceDB client configuration Signed-off-by: Marcel Coetzee * Update embedding model config and schema in LanceDB Signed-off-by: Marcel Coetzee * Refactor lancedb_client.py, remove unused methods and imports Signed-off-by: Marcel Coetzee * Add support for adding multiple fields to LanceDB table in a single operation Signed-off-by: Marcel Coetzee * Only filter by successful loads Signed-off-by: Marcel Coetzee * Remove redundant exception handling in JSON extraction Signed-off-by: Marcel Coetzee * Refactor lancedb_client.py for better code readability Signed-off-by: Marcel Coetzee * Refactor lancedb_client.py for improved code readability Signed-off-by: Marcel Coetzee * Fix module docstring Signed-off-by: Marcel Coetzee * Remove embedding_fields from make_arrow_field_schema function Signed-off-by: Marcel Coetzee * Add merge key support Signed-off-by: Marcel Coetzee * Refactor `get_stored_state` to perform join in memory Signed-off-by: Marcel Coetzee * Packaging Signed-off-by: Marcel Coetzee * Format Signed-off-by: Marcel Coetzee * Update dependencies in GitHub workflow for testing lancedb Signed-off-by: Marcel Coetzee * Add "cohere" to package dependencies in pyproject.toml Signed-off-by: Marcel Coetzee * Dependencies Signed-off-by: Marcel Coetzee * Update dependencies installation in GitHub workflow Signed-off-by: Marcel Coetzee * Dependencies Signed-off-by: Marcel Coetzee * Update dependency in GitHub workflow Signed-off-by: Marcel Coetzee * Dependencies Signed-off-by: Marcel Coetzee * Dependencies Signed-off-by: Marcel Coetzee * Add documentation for LanceDB Signed-off-by: Marcel Coetzee * Add limitations Signed-off-by: Marcel Coetzee * Offload ordering logic from LanceDB Signed-off-by: Marcel Coetzee * Update import statements in lancedb client and exceptions files Signed-off-by: Marcel Coetzee * Create `_get_table_name` getter Signed-off-by: Marcel Coetzee * Format Signed-off-by: Marcel Coetzee * Avoid race conditions by delegating all state management to dlt Signed-off-by: Marcel Coetzee * Imports Signed-off-by: Marcel Coetzee * small doc and test fixes * Fix OpenAI embedding handling of empty strings Replace empty strings with a placeholder before sending to the OpenAI API, and handle the placeholder as an empty embedding in the results. This avoids BadRequestErrors from the API when empty strings are present in the input data. Implemented by subclassing OpenAIEmbeddings and overriding sanitize_input and generate_embeddings methods. Signed-off-by: Marcel Coetzee * Add 'embeddings' dependencies manually Signed-off-by: Marcel Coetzee * Finally... Signed-off-by: Marcel Coetzee * Dependencies Signed-off-by: Marcel Coetzee * Dependencies Signed-off-by: Marcel Coetzee * Docs Signed-off-by: Marcel Coetzee * Remove superfluous helper method. 
Signed-off-by: Marcel Coetzee * Lock File Signed-off-by: Marcel Coetzee * Make api_key and embedding_model_provider_api_key optional Signed-off-by: Marcel Coetzee * Clear environment for config test Signed-off-by: Marcel Coetzee * Minor test config Signed-off-by: Marcel Coetzee * test config Signed-off-by: Marcel Coetzee * lancedb config Signed-off-by: Marcel Coetzee * Config test Signed-off-by: Marcel Coetzee * Config Signed-off-by: Marcel Coetzee * config Signed-off-by: Marcel Coetzee * Import lancedb_adapter function instead of module in adapter collection module Signed-off-by: Marcel Coetzee * Clarify embedding facilities in LanceDB docs Signed-off-by: Marcel Coetzee * update lancedb to support new naming setup (cleanup will follow) * update lockfile --------- Signed-off-by: Marcel Coetzee Co-authored-by: Dave --- .../workflows/test_destination_lancedb.yml | 81 ++ .github/workflows/test_doc_snippets.yml | 2 +- dlt/destinations/__init__.py | 2 + dlt/destinations/adapters.py | 2 + dlt/destinations/impl/lancedb/__init__.py | 1 + .../impl/lancedb/configuration.py | 111 +++ dlt/destinations/impl/lancedb/exceptions.py | 30 + dlt/destinations/impl/lancedb/factory.py | 53 ++ .../impl/lancedb/lancedb_adapter.py | 58 ++ .../impl/lancedb/lancedb_client.py | 767 ++++++++++++++++++ dlt/destinations/impl/lancedb/models.py | 34 + dlt/destinations/impl/lancedb/schema.py | 84 ++ dlt/destinations/impl/lancedb/utils.py | 55 ++ .../dlt-ecosystem/destinations/lancedb.md | 211 +++++ docs/website/sidebars.js | 1 + poetry.lock | 268 ++---- pyproject.toml | 3 + tests/load/lancedb/__init__.py | 3 + tests/load/lancedb/test_config.py | 35 + tests/load/lancedb/test_pipeline.py | 435 ++++++++++ tests/load/lancedb/utils.py | 74 ++ tests/load/qdrant/test_pipeline.py | 2 +- tests/load/utils.py | 6 +- tests/utils.py | 11 +- 24 files changed, 2121 insertions(+), 208 deletions(-) create mode 100644 .github/workflows/test_destination_lancedb.yml create mode 100644 dlt/destinations/impl/lancedb/__init__.py create mode 100644 dlt/destinations/impl/lancedb/configuration.py create mode 100644 dlt/destinations/impl/lancedb/exceptions.py create mode 100644 dlt/destinations/impl/lancedb/factory.py create mode 100644 dlt/destinations/impl/lancedb/lancedb_adapter.py create mode 100644 dlt/destinations/impl/lancedb/lancedb_client.py create mode 100644 dlt/destinations/impl/lancedb/models.py create mode 100644 dlt/destinations/impl/lancedb/schema.py create mode 100644 dlt/destinations/impl/lancedb/utils.py create mode 100644 docs/website/docs/dlt-ecosystem/destinations/lancedb.md create mode 100644 tests/load/lancedb/__init__.py create mode 100644 tests/load/lancedb/test_config.py create mode 100644 tests/load/lancedb/test_pipeline.py create mode 100644 tests/load/lancedb/utils.py diff --git a/.github/workflows/test_destination_lancedb.yml b/.github/workflows/test_destination_lancedb.yml new file mode 100644 index 0000000000..02b5ef66eb --- /dev/null +++ b/.github/workflows/test_destination_lancedb.yml @@ -0,0 +1,81 @@ +name: dest | lancedb + +on: + pull_request: + branches: + - master + - devel + workflow_dispatch: + schedule: + - cron: '0 2 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + DLT_SECRETS_TOML: ${{ secrets.DLT_SECRETS_TOML }} + + RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 + RUNTIME__LOG_LEVEL: ERROR + RUNTIME__DLTHUB_TELEMETRY_ENDPOINT: ${{ 
secrets.RUNTIME__DLTHUB_TELEMETRY_ENDPOINT }} + + ACTIVE_DESTINATIONS: "[\"lancedb\"]" + ALL_FILESYSTEM_DRIVERS: "[\"memory\"]" + +jobs: + get_docs_changes: + name: docs changes + uses: ./.github/workflows/get_docs_changes.yml + if: ${{ !github.event.pull_request.head.repo.fork || contains(github.event.pull_request.labels.*.name, 'ci from fork')}} + + run_loader: + name: dest | lancedb tests + needs: get_docs_changes + if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' + defaults: + run: + shell: bash + runs-on: "ubuntu-latest" + + steps: + - name: Check out + uses: actions/checkout@master + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: "3.11.x" + + - name: Install Poetry + uses: snok/install-poetry@v1.3.2 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp + + - name: create secrets.toml + run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml + + - name: Install dependencies + run: poetry install --no-interaction -E lancedb -E parquet --with sentry-sdk --with pipeline + + - name: Install embedding provider dependencies + run: poetry run pip install openai + + - run: | + poetry run pytest tests/load -m "essential" + name: Run essential tests Linux + if: ${{ ! (contains(github.event.pull_request.labels.*.name, 'ci full') || github.event_name == 'schedule')}} + + - run: | + poetry run pytest tests/load + name: Run all tests Linux + if: ${{ contains(github.event.pull_request.labels.*.name, 'ci full') || github.event_name == 'schedule'}} diff --git a/.github/workflows/test_doc_snippets.yml b/.github/workflows/test_doc_snippets.yml index cb6417a4ab..2c51695714 100644 --- a/.github/workflows/test_doc_snippets.yml +++ b/.github/workflows/test_doc_snippets.yml @@ -61,7 +61,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres --with docs,sentry-sdk --without airflow + run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres -E lancedb --with docs,sentry-sdk --without airflow - name: create secrets.toml for examples run: pwd && echo "$DLT_SECRETS_TOML" > docs/examples/.dlt/secrets.toml diff --git a/dlt/destinations/__init__.py b/dlt/destinations/__init__.py index 302de24a6b..0546d16bcd 100644 --- a/dlt/destinations/__init__.py +++ b/dlt/destinations/__init__.py @@ -8,6 +8,7 @@ from dlt.destinations.impl.athena.factory import athena from dlt.destinations.impl.redshift.factory import redshift from dlt.destinations.impl.qdrant.factory import qdrant +from dlt.destinations.impl.lancedb.factory import lancedb from dlt.destinations.impl.motherduck.factory import motherduck from dlt.destinations.impl.weaviate.factory import weaviate from dlt.destinations.impl.destination.factory import destination @@ -28,6 +29,7 @@ "athena", "redshift", "qdrant", + "lancedb", "motherduck", "weaviate", "synapse", diff --git a/dlt/destinations/adapters.py b/dlt/destinations/adapters.py index 42d4879653..0cf04b7b59 100644 --- a/dlt/destinations/adapters.py +++ b/dlt/destinations/adapters.py @@ -2,6 +2,7 @@ from dlt.destinations.impl.weaviate.weaviate_adapter import weaviate_adapter from 
dlt.destinations.impl.qdrant.qdrant_adapter import qdrant_adapter +from dlt.destinations.impl.lancedb import lancedb_adapter from dlt.destinations.impl.bigquery.bigquery_adapter import bigquery_adapter from dlt.destinations.impl.synapse.synapse_adapter import synapse_adapter from dlt.destinations.impl.clickhouse.clickhouse_adapter import clickhouse_adapter @@ -10,6 +11,7 @@ __all__ = [ "weaviate_adapter", "qdrant_adapter", + "lancedb_adapter", "bigquery_adapter", "synapse_adapter", "clickhouse_adapter", diff --git a/dlt/destinations/impl/lancedb/__init__.py b/dlt/destinations/impl/lancedb/__init__.py new file mode 100644 index 0000000000..bc6974b072 --- /dev/null +++ b/dlt/destinations/impl/lancedb/__init__.py @@ -0,0 +1 @@ +from dlt.destinations.impl.lancedb.lancedb_adapter import lancedb_adapter diff --git a/dlt/destinations/impl/lancedb/configuration.py b/dlt/destinations/impl/lancedb/configuration.py new file mode 100644 index 0000000000..ba3a8b49d9 --- /dev/null +++ b/dlt/destinations/impl/lancedb/configuration.py @@ -0,0 +1,111 @@ +import dataclasses +from typing import Optional, Final, Literal, ClassVar, List + +from dlt.common.configuration import configspec +from dlt.common.configuration.specs.base_configuration import ( + BaseConfiguration, + CredentialsConfiguration, +) +from dlt.common.destination.reference import DestinationClientDwhConfiguration +from dlt.common.typing import TSecretStrValue +from dlt.common.utils import digest128 + + +@configspec +class LanceDBCredentials(CredentialsConfiguration): + uri: Optional[str] = ".lancedb" + """LanceDB database URI. Defaults to local, on-disk instance. + + The available schemas are: + + - `/path/to/database` - local database. + - `db://host:port` - remote database (LanceDB cloud). + """ + api_key: Optional[TSecretStrValue] = None + """API key for the remote connections (LanceDB cloud).""" + embedding_model_provider_api_key: Optional[str] = None + """API key for the embedding model provider.""" + + __config_gen_annotations__: ClassVar[List[str]] = [ + "uri", + "api_key", + "embedding_model_provider_api_key", + ] + + +@configspec +class LanceDBClientOptions(BaseConfiguration): + max_retries: Optional[int] = 3 + """`EmbeddingFunction` class wraps the calls for source and query embedding + generation inside a rate limit handler that retries the requests with exponential + backoff after successive failures. + + You can tune it by setting it to a different number, or disable it by setting it to 0.""" + + __config_gen_annotations__: ClassVar[List[str]] = [ + "max_retries", + ] + + +TEmbeddingProvider = Literal[ + "gemini-text", + "bedrock-text", + "cohere", + "gte-text", + "imagebind", + "instructor", + "open-clip", + "openai", + "sentence-transformers", + "huggingface", + "colbert", +] + + +@configspec +class LanceDBClientConfiguration(DestinationClientDwhConfiguration): + destination_type: Final[str] = dataclasses.field( # type: ignore + default="LanceDB", init=False, repr=False, compare=False + ) + credentials: LanceDBCredentials = None + dataset_separator: str = "___" + """Character for the dataset separator.""" + dataset_name: Final[Optional[str]] = dataclasses.field( # type: ignore + default=None, init=False, repr=False, compare=False + ) + + options: Optional[LanceDBClientOptions] = None + """LanceDB client options.""" + + embedding_model_provider: TEmbeddingProvider = "cohere" + """Embedding provider used for generating embeddings. Default is "cohere". 
You can find the full list of + providers at https://github.com/lancedb/lancedb/tree/main/python/python/lancedb/embeddings as well as + https://lancedb.github.io/lancedb/embeddings/default_embedding_functions/.""" + embedding_model: str = "embed-english-v3.0" + """The model used by the embedding provider for generating embeddings. + Check with the embedding provider which options are available. + Reference https://lancedb.github.io/lancedb/embeddings/default_embedding_functions/.""" + embedding_model_dimensions: Optional[int] = None + """The dimensions of the embeddings generated. In most cases it will be automatically inferred, by LanceDB, + but it is configurable in rare cases. + + Make sure it corresponds with the associated embedding model's dimensionality.""" + vector_field_name: str = "vector__" + """Name of the special field to store the vector embeddings.""" + id_field_name: str = "id__" + """Name of the special field to manage deduplication.""" + sentinel_table_name: str = "dltSentinelTable" + """Name of the sentinel table that encapsulates datasets. Since LanceDB has no + concept of schemas, this table serves as a proxy to group related dlt tables together.""" + + __config_gen_annotations__: ClassVar[List[str]] = [ + "embedding_model", + "embedding_model_provider", + ] + + def fingerprint(self) -> str: + """Returns a fingerprint of a connection string.""" + + if self.credentials and self.credentials.uri: + return digest128(self.credentials.uri) + return "" diff --git a/dlt/destinations/impl/lancedb/exceptions.py b/dlt/destinations/impl/lancedb/exceptions.py new file mode 100644 index 0000000000..35b86ce76c --- /dev/null +++ b/dlt/destinations/impl/lancedb/exceptions.py @@ -0,0 +1,30 @@ +from functools import wraps +from typing import ( + Any, +) + +from lancedb.exceptions import MissingValueError, MissingColumnError # type: ignore + +from dlt.common.destination.exceptions import ( + DestinationUndefinedEntity, + DestinationTerminalException, +) +from dlt.common.destination.reference import JobClientBase +from dlt.common.typing import TFun + + +def lancedb_error(f: TFun) -> TFun: + @wraps(f) + def _wrap(self: JobClientBase, *args: Any, **kwargs: Any) -> Any: + try: + return f(self, *args, **kwargs) + except ( + FileNotFoundError, + MissingValueError, + MissingColumnError, + ) as status_ex: + raise DestinationUndefinedEntity(status_ex) from status_ex + except Exception as e: + raise DestinationTerminalException(e) from e + + return _wrap # type: ignore[return-value] diff --git a/dlt/destinations/impl/lancedb/factory.py b/dlt/destinations/impl/lancedb/factory.py new file mode 100644 index 0000000000..f2e17168b9 --- /dev/null +++ b/dlt/destinations/impl/lancedb/factory.py @@ -0,0 +1,53 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.destinations.impl.lancedb.configuration import ( + LanceDBCredentials, + LanceDBClientConfiguration, +) + + +if t.TYPE_CHECKING: + from dlt.destinations.impl.lancedb.lancedb_client import LanceDBClient + + +class lancedb(Destination[LanceDBClientConfiguration, "LanceDBClient"]): + spec = LanceDBClientConfiguration + + def _raw_capabilities(self) -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "jsonl" + caps.supported_loader_file_formats = ["jsonl"] + + caps.max_identifier_length = 200 + caps.max_column_identifier_length = 1024 + caps.max_query_length = 8 * 1024 * 1024 + caps.is_max_query_length_in_bytes = False + 
caps.max_text_data_type_length = 8 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = False + caps.supports_ddl_transactions = False + + caps.decimal_precision = (38, 18) + caps.timestamp_precision = 6 + + return caps + + @property + def client_class(self) -> t.Type["LanceDBClient"]: + from dlt.destinations.impl.lancedb.lancedb_client import LanceDBClient + + return LanceDBClient + + def __init__( + self, + credentials: t.Union[LanceDBCredentials, t.Dict[str, t.Any]] = None, + destination_name: t.Optional[str] = None, + environment: t.Optional[str] = None, + **kwargs: t.Any, + ) -> None: + super().__init__( + credentials=credentials, + destination_name=destination_name, + environment=environment, + **kwargs, + ) diff --git a/dlt/destinations/impl/lancedb/lancedb_adapter.py b/dlt/destinations/impl/lancedb/lancedb_adapter.py new file mode 100644 index 0000000000..bb33632b48 --- /dev/null +++ b/dlt/destinations/impl/lancedb/lancedb_adapter.py @@ -0,0 +1,58 @@ +from typing import Any + +from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns +from dlt.destinations.utils import ensure_resource +from dlt.extract import DltResource + + +VECTORIZE_HINT = "x-lancedb-embed" + + +def lancedb_adapter( + data: Any, + embed: TColumnNames = None, +) -> DltResource: + """Prepares data for the LanceDB destination by specifying which columns should be embedded. + + Args: + data (Any): The data to be transformed. It can be raw data or an instance + of DltResource. If raw data, the function wraps it into a DltResource + object. + embed (TColumnNames, optional): Specify columns to generate embeddings for. + It can be a single column name as a string, or a list of column names. + + Returns: + DltResource: A resource with applied LanceDB-specific hints. + + Raises: + ValueError: If input for `embed` invalid or empty. + + Examples: + >>> data = [{"name": "Marcel", "description": "Moonbase Engineer"}] + >>> lancedb_adapter(data, embed="description") + [DltResource with hints applied] + """ + resource = ensure_resource(data) + + column_hints: TTableSchemaColumns = {} + + if embed: + if isinstance(embed, str): + embed = [embed] + if not isinstance(embed, list): + raise ValueError( + "'embed' must be a list of column names or a single column name as a string." 
+ ) + + for column_name in embed: + column_hints[column_name] = { + "name": column_name, + VECTORIZE_HINT: True, # type: ignore[misc] + } + + if not column_hints: + raise ValueError("A value for 'embed' must be specified.") + else: + resource.apply_hints(columns=column_hints) + + return resource diff --git a/dlt/destinations/impl/lancedb/lancedb_client.py b/dlt/destinations/impl/lancedb/lancedb_client.py new file mode 100644 index 0000000000..128e2c7e7e --- /dev/null +++ b/dlt/destinations/impl/lancedb/lancedb_client.py @@ -0,0 +1,767 @@ +import uuid +from types import TracebackType +from typing import ( + ClassVar, + List, + Any, + cast, + Union, + Tuple, + Iterable, + Type, + Optional, + Dict, + Sequence, +) + +import lancedb # type: ignore +import pyarrow as pa +from lancedb import DBConnection +from lancedb.embeddings import EmbeddingFunctionRegistry, TextEmbeddingFunction # type: ignore +from lancedb.query import LanceQueryBuilder # type: ignore +from lancedb.table import Table # type: ignore +from numpy import ndarray +from pyarrow import Array, ChunkedArray, ArrowInvalid + +from dlt.common import json, pendulum, logger +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.destination.exceptions import ( + DestinationUndefinedEntity, + DestinationTransientException, + DestinationTerminalException, +) +from dlt.common.destination.reference import ( + JobClientBase, + WithStateSync, + LoadJob, + StorageSchemaInfo, + StateInfo, + TLoadJobState, +) +from dlt.common.pendulum import timedelta +from dlt.common.schema import Schema, TTableSchema, TSchemaTables +from dlt.common.schema.typing import ( + TColumnType, + TTableFormat, + TTableSchemaColumns, + TWriteDisposition, +) +from dlt.common.schema.utils import get_columns_names_with_prop +from dlt.common.storages import FileStorage +from dlt.common.typing import DictStrAny +from dlt.destinations.impl.lancedb.configuration import ( + LanceDBClientConfiguration, +) +from dlt.destinations.impl.lancedb.exceptions import ( + lancedb_error, +) +from dlt.destinations.impl.lancedb.lancedb_adapter import VECTORIZE_HINT +from dlt.destinations.impl.lancedb.schema import ( + make_arrow_field_schema, + make_arrow_table_schema, + TArrowSchema, + NULL_SCHEMA, + TArrowField, +) +from dlt.destinations.impl.lancedb.utils import ( + list_merge_identifiers, + generate_uuid, + set_non_standard_providers_environment_variables, +) +from dlt.destinations.job_impl import EmptyLoadJob +from dlt.destinations.type_mapping import TypeMapper + + +TIMESTAMP_PRECISION_TO_UNIT: Dict[int, str] = {0: "s", 3: "ms", 6: "us", 9: "ns"} +UNIT_TO_TIMESTAMP_PRECISION: Dict[str, int] = {v: k for k, v in TIMESTAMP_PRECISION_TO_UNIT.items()} + + +class LanceDBTypeMapper(TypeMapper): + sct_to_unbound_dbt = { + "text": pa.string(), + "double": pa.float64(), + "bool": pa.bool_(), + "bigint": pa.int64(), + "binary": pa.binary(), + "date": pa.date32(), + "complex": pa.string(), + } + + sct_to_dbt = {} + + dbt_to_sct = { + pa.string(): "text", + pa.float64(): "double", + pa.bool_(): "bool", + pa.int64(): "bigint", + pa.binary(): "binary", + pa.date32(): "date", + } + + def to_db_decimal_type( + self, precision: Optional[int], scale: Optional[int] + ) -> pa.Decimal128Type: + precision, scale = self.decimal_precision(precision, scale) + return pa.decimal128(precision, scale) + + def to_db_datetime_type( + self, precision: Optional[int], table_format: TTableFormat = None + ) -> pa.TimestampType: + unit: str = 
TIMESTAMP_PRECISION_TO_UNIT[self.capabilities.timestamp_precision] + return pa.timestamp(unit, "UTC") + + def to_db_time_type( + self, precision: Optional[int], table_format: TTableFormat = None + ) -> pa.Time64Type: + unit: str = TIMESTAMP_PRECISION_TO_UNIT[self.capabilities.timestamp_precision] + return pa.time64(unit) + + def from_db_type( + self, + db_type: pa.DataType, + precision: Optional[int] = None, + scale: Optional[int] = None, + ) -> TColumnType: + if isinstance(db_type, pa.TimestampType): + return dict( + data_type="timestamp", + precision=UNIT_TO_TIMESTAMP_PRECISION[db_type.unit], + scale=scale, + ) + if isinstance(db_type, pa.Time64Type): + return dict( + data_type="time", + precision=UNIT_TO_TIMESTAMP_PRECISION[db_type.unit], + scale=scale, + ) + if isinstance(db_type, pa.Decimal128Type): + precision, scale = db_type.precision, db_type.scale + if (precision, scale) == self.capabilities.wei_precision: + return cast(TColumnType, dict(data_type="wei")) + return dict(data_type="decimal", precision=precision, scale=scale) + return super().from_db_type(db_type, precision, scale) + + +def upload_batch( + records: List[DictStrAny], + /, + *, + db_client: DBConnection, + table_name: str, + write_disposition: TWriteDisposition, + id_field_name: Optional[str] = None, +) -> None: + """Inserts records into a LanceDB table with automatic embedding computation. + + Args: + records: The data to be inserted as payload. + db_client: The LanceDB client connection. + table_name: The name of the table to insert into. + id_field_name: The name of the ID field for update/merge operations. + write_disposition: The write disposition - one of 'skip', 'append', 'replace', 'merge'. + + Raises: + ValueError: If the write disposition is unsupported, or `id_field_name` is not + provided for update/merge operations. + """ + + try: + tbl = db_client.open_table(table_name) + tbl.checkout_latest() + except FileNotFoundError as e: + raise DestinationTransientException( + "Couldn't open lancedb database. Batch WILL BE RETRIED" + ) from e + + try: + if write_disposition in ("append", "skip"): + tbl.add(records) + elif write_disposition == "replace": + tbl.add(records, mode="overwrite") + elif write_disposition == "merge": + if not id_field_name: + raise ValueError("To perform a merge update, 'id_field_name' must be specified.") + tbl.merge_insert( + id_field_name + ).when_matched_update_all().when_not_matched_insert_all().execute(records) + else: + raise DestinationTerminalException( + f"Unsupported write disposition {write_disposition} for LanceDB Destination - batch" + " failed AND WILL **NOT** BE RETRIED." + ) + except ArrowInvalid as e: + raise DestinationTerminalException( + "Python and Arrow datatype mismatch - batch failed AND WILL **NOT** BE RETRIED." 
+ ) from e + + +class LanceDBClient(JobClientBase, WithStateSync): + """LanceDB destination handler.""" + + model_func: TextEmbeddingFunction + + def __init__( + self, + schema: Schema, + config: LanceDBClientConfiguration, + capabilities: DestinationCapabilitiesContext, + ) -> None: + super().__init__(schema, config, capabilities) + self.config: LanceDBClientConfiguration = config + self.db_client: DBConnection = lancedb.connect( + uri=self.config.credentials.uri, + api_key=self.config.credentials.api_key, + read_consistency_interval=timedelta(0), + ) + self.registry = EmbeddingFunctionRegistry.get_instance() + self.type_mapper = LanceDBTypeMapper(self.capabilities) + self.sentinel_table_name = config.sentinel_table_name + + embedding_model_provider = self.config.embedding_model_provider + + # LanceDB doesn't provide a standardized way to set API keys across providers. + # Some use ENV variables and others allow passing api key as an argument. + # To account for this, we set provider environment variable as well. + set_non_standard_providers_environment_variables( + embedding_model_provider, + self.config.credentials.embedding_model_provider_api_key, + ) + # Use the monkey-patched implementation if openai was chosen. + if embedding_model_provider == "openai": + from dlt.destinations.impl.lancedb.models import PatchedOpenAIEmbeddings + + self.model_func = PatchedOpenAIEmbeddings( + max_retries=self.config.options.max_retries, + api_key=self.config.credentials.api_key, + ) + else: + self.model_func = self.registry.get(embedding_model_provider).create( + name=self.config.embedding_model, + max_retries=self.config.options.max_retries, + api_key=self.config.credentials.api_key, + ) + + self.vector_field_name = self.config.vector_field_name + self.id_field_name = self.config.id_field_name + + @property + def dataset_name(self) -> str: + return self.config.normalize_dataset_name(self.schema) + + @property + def sentinel_table(self) -> str: + return self.make_qualified_table_name(self.sentinel_table_name) + + def make_qualified_table_name(self, table_name: str) -> str: + return ( + f"{self.dataset_name}{self.config.dataset_separator}{table_name}" + if self.dataset_name + else table_name + ) + + def get_table_schema(self, table_name: str) -> TArrowSchema: + schema_table: Table = self.db_client.open_table(table_name) + schema_table.checkout_latest() + schema = schema_table.schema + return cast( + TArrowSchema, + schema, + ) + + @lancedb_error + def create_table(self, table_name: str, schema: TArrowSchema, mode: str = "create") -> Table: + """Create a LanceDB Table from the provided LanceModel or PyArrow schema. + + Args: + schema: The table schema to create. + table_name: The name of the table to create. + mode (): The mode to use when creating the table. Can be either "create" or "overwrite". + By default, if the table already exists, an exception is raised. + If you want to overwrite the table, use mode="overwrite". + """ + return self.db_client.create_table(table_name, schema=schema, mode=mode) + + def delete_table(self, table_name: str) -> None: + """Delete a LanceDB table. + + Args: + table_name: The name of the table to delete. + """ + self.db_client.drop_table(table_name) + + def query_table( + self, + table_name: str, + query: Union[ + List[Any], ndarray[Any, Any], Array, ChunkedArray, str, Tuple[Any], None + ] = None, + ) -> LanceQueryBuilder: + """Query a LanceDB table. + + Args: + table_name: The name of the table to query. + query: The targeted vector to search for. 
+ + Returns: + A LanceDB query builder. + """ + query_table: Table = self.db_client.open_table(table_name) + query_table.checkout_latest() + return query_table.search(query=query) + + @lancedb_error + def _get_table_names(self) -> List[str]: + """Return all tables in the dataset, excluding the sentinel table.""" + if self.dataset_name: + prefix = f"{self.dataset_name}{self.config.dataset_separator}" + table_names = [ + table_name + for table_name in self.db_client.table_names() + if table_name.startswith(prefix) + ] + else: + table_names = self.db_client.table_names() + + return [table_name for table_name in table_names if table_name != self.sentinel_table] + + @lancedb_error + def drop_storage(self) -> None: + """Drop the dataset from the LanceDB instance. + + Deletes all tables in the dataset and all data, as well as sentinel table associated with them. + + If the dataset name was not provided, it deletes all the tables in the current schema. + """ + for table_name in self._get_table_names(): + self.db_client.drop_table(table_name) + + self._delete_sentinel_table() + + @lancedb_error + def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: + if not self.is_storage_initialized(): + self._create_sentinel_table() + elif truncate_tables: + for table_name in truncate_tables: + fq_table_name = self.make_qualified_table_name(table_name) + if not self.table_exists(fq_table_name): + continue + schema = self.get_table_schema(fq_table_name) + self.db_client.drop_table(fq_table_name) + self.create_table( + table_name=fq_table_name, + schema=schema, + ) + + @lancedb_error + def is_storage_initialized(self) -> bool: + return self.table_exists(self.sentinel_table) + + def _create_sentinel_table(self) -> Table: + """Create an empty table to indicate that the storage is initialized.""" + return self.create_table(schema=NULL_SCHEMA, table_name=self.sentinel_table) + + def _delete_sentinel_table(self) -> None: + """Delete the sentinel table.""" + self.db_client.drop_table(self.sentinel_table) + + @lancedb_error + def update_stored_schema( + self, + only_tables: Iterable[str] = None, + expected_update: TSchemaTables = None, + ) -> Optional[TSchemaTables]: + super().update_stored_schema(only_tables, expected_update) + applied_update: TSchemaTables = {} + + try: + schema_info = self.get_stored_schema_by_hash(self.schema.stored_version_hash) + except DestinationUndefinedEntity: + schema_info = None + + if schema_info is None: + logger.info( + f"Schema with hash {self.schema.stored_version_hash} " + "not found in the storage. 
upgrading" + ) + self._execute_schema_update(only_tables) + else: + logger.info( + f"Schema with hash {self.schema.stored_version_hash} " + f"inserted at {schema_info.inserted_at} found " + "in storage, no upgrade required" + ) + return applied_update + + def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: + table_schema: TTableSchemaColumns = {} + + try: + fq_table_name = self.make_qualified_table_name(table_name) + + table: Table = self.db_client.open_table(fq_table_name) + table.checkout_latest() + arrow_schema: TArrowSchema = table.schema + except FileNotFoundError: + return False, table_schema + + field: TArrowField + for field in arrow_schema: + name = self.schema.naming.normalize_identifier(field.name) + print(field.type) + print(field.name) + table_schema[name] = { + "name": name, + **self.type_mapper.from_db_type(field.type), + } + return True, table_schema + + @lancedb_error + def add_table_fields( + self, table_name: str, field_schemas: List[TArrowField] + ) -> Optional[Table]: + """Add multiple fields to the LanceDB table at once. + + Args: + table_name: The name of the table to create the fields on. + field_schemas: The list of fields to create. + """ + table: Table = self.db_client.open_table(table_name) + table.checkout_latest() + arrow_table = table.to_arrow() + + # Check if any of the new fields already exist in the table. + existing_fields = set(arrow_table.schema.names) + new_fields = [field for field in field_schemas if field.name not in existing_fields] + + if not new_fields: + # All fields already present, skip. + return None + + null_arrays = [pa.nulls(len(arrow_table), type=field.type) for field in new_fields] + + for field, null_array in zip(new_fields, null_arrays): + arrow_table = arrow_table.append_column(field, null_array) + + try: + return self.db_client.create_table(table_name, arrow_table, mode="overwrite") + except OSError: + # Error occurred while creating the table, skip. 
+ return None + + def _execute_schema_update(self, only_tables: Iterable[str]) -> None: + for table_name in only_tables or self.schema.tables: + exists, existing_columns = self.get_storage_table(table_name) + new_columns = self.schema.get_new_table_columns(table_name, existing_columns) + print(table_name) + print(new_columns) + embedding_fields: List[str] = get_columns_names_with_prop( + self.schema.get_table(table_name), VECTORIZE_HINT + ) + logger.info(f"Found {len(new_columns)} updates for {table_name} in {self.schema.name}") + if len(new_columns) > 0: + if exists: + field_schemas: List[TArrowField] = [ + make_arrow_field_schema(column["name"], column, self.type_mapper) + for column in new_columns + ] + fq_table_name = self.make_qualified_table_name(table_name) + self.add_table_fields(fq_table_name, field_schemas) + else: + if table_name not in self.schema.dlt_table_names(): + embedding_fields = get_columns_names_with_prop( + self.schema.get_table(table_name=table_name), VECTORIZE_HINT + ) + vector_field_name = self.vector_field_name + id_field_name = self.id_field_name + embedding_model_func = self.model_func + embedding_model_dimensions = self.config.embedding_model_dimensions + else: + embedding_fields = None + vector_field_name = None + id_field_name = None + embedding_model_func = None + embedding_model_dimensions = None + + table_schema: TArrowSchema = make_arrow_table_schema( + table_name, + schema=self.schema, + type_mapper=self.type_mapper, + embedding_fields=embedding_fields, + embedding_model_func=embedding_model_func, + embedding_model_dimensions=embedding_model_dimensions, + vector_field_name=vector_field_name, + id_field_name=id_field_name, + ) + fq_table_name = self.make_qualified_table_name(table_name) + self.create_table(fq_table_name, table_schema) + + self.update_schema_in_storage() + + @lancedb_error + def update_schema_in_storage(self) -> None: + records = [ + { + self.schema.naming.normalize_identifier("version"): self.schema.version, + self.schema.naming.normalize_identifier( + "engine_version" + ): self.schema.ENGINE_VERSION, + self.schema.naming.normalize_identifier("inserted_at"): str(pendulum.now()), + self.schema.naming.normalize_identifier("schema_name"): self.schema.name, + self.schema.naming.normalize_identifier( + "version_hash" + ): self.schema.stored_version_hash, + self.schema.naming.normalize_identifier("schema"): json.dumps( + self.schema.to_dict() + ), + } + ] + fq_version_table_name = self.make_qualified_table_name(self.schema.version_table_name) + write_disposition = self.schema.get_table(self.schema.version_table_name).get( + "write_disposition" + ) + print("UPLOAD") + upload_batch( + records, + db_client=self.db_client, + table_name=fq_version_table_name, + write_disposition=write_disposition, + ) + + @lancedb_error + def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: + """Retrieves the latest completed state for a pipeline.""" + fq_state_table_name = self.make_qualified_table_name(self.schema.state_table_name) + fq_loads_table_name = self.make_qualified_table_name(self.schema.loads_table_name) + + state_table_: Table = self.db_client.open_table(fq_state_table_name) + state_table_.checkout_latest() + + loads_table_: Table = self.db_client.open_table(fq_loads_table_name) + loads_table_.checkout_latest() + + # normalize property names + p_load_id = self.schema.naming.normalize_identifier("load_id") + p_dlt_load_id = self.schema.naming.normalize_identifier("_dlt_load_id") + p_pipeline_name = 
self.schema.naming.normalize_identifier("pipeline_name") + p_status = self.schema.naming.normalize_identifier("status") + p_version = self.schema.naming.normalize_identifier("version") + p_engine_version = self.schema.naming.normalize_identifier("engine_version") + p_state = self.schema.naming.normalize_identifier("state") + p_created_at = self.schema.naming.normalize_identifier("created_at") + p_version_hash = self.schema.naming.normalize_identifier("version_hash") + + # Read the tables into memory as Arrow tables, with pushdown predicates, so we pull as less + # data into memory as possible. + state_table = ( + state_table_.search() + .where(f"`{p_pipeline_name}` = '{pipeline_name}'", prefilter=True) + .to_arrow() + ) + loads_table = loads_table_.search().where(f"`{p_status}` = 0", prefilter=True).to_arrow() + + # Join arrow tables in-memory. + joined_table: pa.Table = state_table.join( + loads_table, keys=p_dlt_load_id, right_keys=p_load_id, join_type="inner" + ).sort_by([(p_dlt_load_id, "descending")]) + + if joined_table.num_rows == 0: + return None + + state = joined_table.take([0]).to_pylist()[0] + return StateInfo( + version=state[p_version], + engine_version=state[p_engine_version], + pipeline_name=state[p_pipeline_name], + state=state[p_state], + created_at=pendulum.instance(state[p_created_at]), + version_hash=state[p_version_hash], + _dlt_load_id=state[p_dlt_load_id], + ) + + @lancedb_error + def get_stored_schema_by_hash(self, schema_hash: str) -> Optional[StorageSchemaInfo]: + fq_version_table_name = self.make_qualified_table_name(self.schema.version_table_name) + + version_table: Table = self.db_client.open_table(fq_version_table_name) + version_table.checkout_latest() + p_version_hash = self.schema.naming.normalize_identifier("version_hash") + p_inserted_at = self.schema.naming.normalize_identifier("inserted_at") + p_schema_name = self.schema.naming.normalize_identifier("schema_name") + p_version = self.schema.naming.normalize_identifier("version") + p_engine_version = self.schema.naming.normalize_identifier("engine_version") + p_schema = self.schema.naming.normalize_identifier("schema") + + try: + schemas = ( + version_table.search().where( + f'`{p_version_hash}` = "{schema_hash}"', prefilter=True + ) + ).to_list() + + # LanceDB's ORDER BY clause doesn't seem to work. 
+ # See https://github.com/dlt-hub/dlt/pull/1375#issuecomment-2171909341 + most_recent_schema = sorted(schemas, key=lambda x: x[p_inserted_at], reverse=True)[0] + return StorageSchemaInfo( + version_hash=most_recent_schema[p_version_hash], + schema_name=most_recent_schema[p_schema_name], + version=most_recent_schema[p_version], + engine_version=most_recent_schema[p_engine_version], + inserted_at=most_recent_schema[p_inserted_at], + schema=most_recent_schema[p_schema], + ) + except IndexError: + return None + + @lancedb_error + def get_stored_schema(self) -> Optional[StorageSchemaInfo]: + """Retrieves newest schema from destination storage.""" + fq_version_table_name = self.make_qualified_table_name(self.schema.version_table_name) + + version_table: Table = self.db_client.open_table(fq_version_table_name) + version_table.checkout_latest() + p_version_hash = self.schema.naming.normalize_identifier("version_hash") + p_inserted_at = self.schema.naming.normalize_identifier("inserted_at") + p_schema_name = self.schema.naming.normalize_identifier("schema_name") + p_version = self.schema.naming.normalize_identifier("version") + p_engine_version = self.schema.naming.normalize_identifier("engine_version") + p_schema = self.schema.naming.normalize_identifier("schema") + + try: + schemas = ( + version_table.search().where( + f'`{p_schema_name}` = "{self.schema.name}"', prefilter=True + ) + ).to_list() + + # LanceDB's ORDER BY clause doesn't seem to work. + # See https://github.com/dlt-hub/dlt/pull/1375#issuecomment-2171909341 + most_recent_schema = sorted(schemas, key=lambda x: x[p_inserted_at], reverse=True)[0] + return StorageSchemaInfo( + version_hash=most_recent_schema[p_version_hash], + schema_name=most_recent_schema[p_schema_name], + version=most_recent_schema[p_version], + engine_version=most_recent_schema[p_engine_version], + inserted_at=most_recent_schema[p_inserted_at], + schema=most_recent_schema[p_schema], + ) + except IndexError: + return None + + def __exit__( + self, + exc_type: Type[BaseException], + exc_val: BaseException, + exc_tb: TracebackType, + ) -> None: + pass + + def __enter__(self) -> "LanceDBClient": + return self + + @lancedb_error + def complete_load(self, load_id: str) -> None: + records = [ + { + self.schema.naming.normalize_identifier("load_id"): load_id, + self.schema.naming.normalize_identifier("schema_name"): self.schema.name, + self.schema.naming.normalize_identifier("status"): 0, + self.schema.naming.normalize_identifier("inserted_at"): str(pendulum.now()), + self.schema.naming.normalize_identifier( + "schema_version_hash" + ): None, # Payload schema must match the target schema. 
+ } + ] + fq_loads_table_name = self.make_qualified_table_name(self.schema.loads_table_name) + write_disposition = self.schema.get_table(self.schema.loads_table_name).get( + "write_disposition" + ) + upload_batch( + records, + db_client=self.db_client, + table_name=fq_loads_table_name, + write_disposition=write_disposition, + ) + + def restore_file_load(self, file_path: str) -> LoadJob: + return EmptyLoadJob.from_file_path(file_path, "completed") + + def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: + return LoadLanceDBJob( + self.schema, + table, + file_path, + type_mapper=self.type_mapper, + db_client=self.db_client, + client_config=self.config, + model_func=self.model_func, + fq_table_name=self.make_qualified_table_name(table["name"]), + ) + + def table_exists(self, table_name: str) -> bool: + return table_name in self.db_client.table_names() + + +class LoadLanceDBJob(LoadJob): + arrow_schema: TArrowSchema + + def __init__( + self, + schema: Schema, + table_schema: TTableSchema, + local_path: str, + type_mapper: LanceDBTypeMapper, + db_client: DBConnection, + client_config: LanceDBClientConfiguration, + model_func: TextEmbeddingFunction, + fq_table_name: str, + ) -> None: + file_name = FileStorage.get_file_name_from_file_path(local_path) + super().__init__(file_name) + self.schema: Schema = schema + self.table_schema: TTableSchema = table_schema + self.db_client: DBConnection = db_client + self.type_mapper: TypeMapper = type_mapper + self.table_name: str = table_schema["name"] + self.fq_table_name: str = fq_table_name + self.unique_identifiers: Sequence[str] = list_merge_identifiers(table_schema) + self.embedding_fields: List[str] = get_columns_names_with_prop(table_schema, VECTORIZE_HINT) + self.embedding_model_func: TextEmbeddingFunction = model_func + self.embedding_model_dimensions: int = client_config.embedding_model_dimensions + self.id_field_name: str = client_config.id_field_name + self.write_disposition: TWriteDisposition = cast( + TWriteDisposition, self.table_schema.get("write_disposition", "append") + ) + + with FileStorage.open_zipsafe_ro(local_path) as f: + records: List[DictStrAny] = [json.loads(line) for line in f] + + if self.table_schema not in self.schema.dlt_tables(): + for record in records: + # Add reserved ID fields. + uuid_id = ( + generate_uuid(record, self.unique_identifiers, self.fq_table_name) + if self.unique_identifiers + else str(uuid.uuid4()) + ) + record.update({self.id_field_name: uuid_id}) + + # LanceDB expects all fields in the target arrow table to be present in the data payload. + # We add and set these missing fields, that are fields not present in the target schema, to NULL. 
+ missing_fields = set(self.table_schema["columns"]) - set(record) + for field in missing_fields: + record[field] = None + + upload_batch( + records, + db_client=db_client, + table_name=self.fq_table_name, + write_disposition=self.write_disposition, + id_field_name=self.id_field_name, + ) + + def state(self) -> TLoadJobState: + return "completed" + + def exception(self) -> str: + raise NotImplementedError() diff --git a/dlt/destinations/impl/lancedb/models.py b/dlt/destinations/impl/lancedb/models.py new file mode 100644 index 0000000000..d90adb62bd --- /dev/null +++ b/dlt/destinations/impl/lancedb/models.py @@ -0,0 +1,34 @@ +from typing import Union, List + +import numpy as np +from lancedb.embeddings import OpenAIEmbeddings # type: ignore +from lancedb.embeddings.registry import register # type: ignore +from lancedb.embeddings.utils import TEXT # type: ignore + + +@register("openai_patched") +class PatchedOpenAIEmbeddings(OpenAIEmbeddings): + EMPTY_STRING_PLACEHOLDER: str = "___EMPTY___" + + def sanitize_input(self, texts: TEXT) -> Union[List[str], np.ndarray]: # type: ignore[type-arg] + """ + Replace empty strings with a placeholder value. + """ + + sanitized_texts = super().sanitize_input(texts) + return [self.EMPTY_STRING_PLACEHOLDER if item == "" else item for item in sanitized_texts] + + def generate_embeddings( + self, + texts: Union[List[str], np.ndarray], # type: ignore[type-arg] + ) -> List[np.array]: # type: ignore[valid-type] + """ + Generate embeddings, treating the placeholder as an empty result. + """ + embeddings: List[np.array] = super().generate_embeddings(texts) # type: ignore[valid-type] + + for i, text in enumerate(texts): + if text == self.EMPTY_STRING_PLACEHOLDER: + embeddings[i] = np.zeros(self.ndims()) + + return embeddings diff --git a/dlt/destinations/impl/lancedb/schema.py b/dlt/destinations/impl/lancedb/schema.py new file mode 100644 index 0000000000..c7cceec274 --- /dev/null +++ b/dlt/destinations/impl/lancedb/schema.py @@ -0,0 +1,84 @@ +"""Utilities for creating arrow schemas from table schemas.""" + +from dlt.common.json import json +from typing import ( + List, + cast, + Optional, +) + +import pyarrow as pa +from lancedb.embeddings import TextEmbeddingFunction # type: ignore +from typing_extensions import TypeAlias + +from dlt.common.schema import Schema, TColumnSchema +from dlt.common.typing import DictStrAny +from dlt.destinations.type_mapping import TypeMapper + + +TArrowSchema: TypeAlias = pa.Schema +TArrowDataType: TypeAlias = pa.DataType +TArrowField: TypeAlias = pa.Field +NULL_SCHEMA: TArrowSchema = pa.schema([]) +"""Empty pyarrow Schema with no fields.""" + + +def arrow_schema_to_dict(schema: TArrowSchema) -> DictStrAny: + return {field.name: field.type for field in schema} + + +def make_arrow_field_schema( + column_name: str, + column: TColumnSchema, + type_mapper: TypeMapper, +) -> TArrowField: + """Creates a PyArrow field from a dlt column schema.""" + dtype = cast(TArrowDataType, type_mapper.to_db_type(column)) + return pa.field(column_name, dtype) + + +def make_arrow_table_schema( + table_name: str, + schema: Schema, + type_mapper: TypeMapper, + id_field_name: Optional[str] = None, + vector_field_name: Optional[str] = None, + embedding_fields: Optional[List[str]] = None, + embedding_model_func: Optional[TextEmbeddingFunction] = None, + embedding_model_dimensions: Optional[int] = None, +) -> TArrowSchema: + """Creates a PyArrow schema from a dlt schema.""" + arrow_schema: List[TArrowField] = [] + + if id_field_name: + 
arrow_schema.append(pa.field(id_field_name, pa.string())) + + if embedding_fields: + # User's provided dimension config, if provided, takes precedence. + vec_size = embedding_model_dimensions or embedding_model_func.ndims() + arrow_schema.append(pa.field(vector_field_name, pa.list_(pa.float32(), vec_size))) + + for column_name, column in schema.get_table_columns(table_name).items(): + field = make_arrow_field_schema(column_name, column, type_mapper) + arrow_schema.append(field) + + metadata = {} + if embedding_model_func: + # Get the registered alias if it exists, otherwise use the class name. + name = getattr( + embedding_model_func, + "__embedding_function_registry_alias__", + embedding_model_func.__class__.__name__, + ) + embedding_functions = [ + { + "source_column": source_column, + "vector_column": vector_field_name, + "name": name, + "model": embedding_model_func.safe_model_dump(), + } + for source_column in embedding_fields + ] + metadata["embedding_functions"] = json.dumps(embedding_functions).encode("utf-8") + + return pa.schema(arrow_schema, metadata=metadata) diff --git a/dlt/destinations/impl/lancedb/utils.py b/dlt/destinations/impl/lancedb/utils.py new file mode 100644 index 0000000000..aeacd4d34b --- /dev/null +++ b/dlt/destinations/impl/lancedb/utils.py @@ -0,0 +1,55 @@ +import os +import uuid +from typing import Sequence, Union, Dict + +from dlt.common.schema import TTableSchema +from dlt.common.schema.utils import get_columns_names_with_prop +from dlt.common.typing import DictStrAny +from dlt.destinations.impl.lancedb.configuration import TEmbeddingProvider + + +PROVIDER_ENVIRONMENT_VARIABLES_MAP: Dict[TEmbeddingProvider, str] = { + "cohere": "COHERE_API_KEY", + "gemini-text": "GOOGLE_API_KEY", + "openai": "OPENAI_API_KEY", + "huggingface": "HUGGINGFACE_API_KEY", +} + + +def generate_uuid(data: DictStrAny, unique_identifiers: Sequence[str], table_name: str) -> str: + """Generates deterministic UUID - used for deduplication. + + Args: + data (Dict[str, Any]): Arbitrary data to generate UUID for. + unique_identifiers (Sequence[str]): A list of unique identifiers. + table_name (str): LanceDB table name. + + Returns: + str: A string representation of the generated UUID. + """ + data_id = "_".join(str(data[key]) for key in unique_identifiers) + return str(uuid.uuid5(uuid.NAMESPACE_DNS, table_name + data_id)) + + +def list_merge_identifiers(table_schema: TTableSchema) -> Sequence[str]: + """Returns a list of merge keys for a table used for either merging or deduplication. + + Args: + table_schema (TTableSchema): a dlt table schema. + + Returns: + Sequence[str]: A list of unique column identifiers. 
+    """
+    if table_schema.get("write_disposition") == "merge":
+        primary_keys = get_columns_names_with_prop(table_schema, "primary_key")
+        merge_keys = get_columns_names_with_prop(table_schema, "merge_key")
+        if join_keys := list(set(primary_keys + merge_keys)):
+            return join_keys
+    return get_columns_names_with_prop(table_schema, "unique")
+
+
+def set_non_standard_providers_environment_variables(
+    embedding_model_provider: TEmbeddingProvider, api_key: Union[str, None]
+) -> None:
+    if embedding_model_provider in PROVIDER_ENVIRONMENT_VARIABLES_MAP:
+        os.environ[PROVIDER_ENVIRONMENT_VARIABLES_MAP[embedding_model_provider]] = api_key or ""
diff --git a/docs/website/docs/dlt-ecosystem/destinations/lancedb.md b/docs/website/docs/dlt-ecosystem/destinations/lancedb.md
new file mode 100644
index 0000000000..dbf90da4b9
--- /dev/null
+++ b/docs/website/docs/dlt-ecosystem/destinations/lancedb.md
@@ -0,0 +1,211 @@
+---
+title: LanceDB
+description: LanceDB is an open source vector database that can be used as a destination in dlt.
+keywords: [ lancedb, vector database, destination, dlt ]
+---
+
+# LanceDB
+
+[LanceDB](https://lancedb.com/) is an open-source, high-performance vector database. It allows you to store data objects and perform similarity searches over them.
+This destination helps you load data into LanceDB from [dlt resources](../../general-usage/resource.md).
+
+## Setup Guide
+
+### Choosing a Model Provider
+
+First, you need to decide which embedding model provider to use. You can find all supported providers by visiting the official [LanceDB docs](https://lancedb.github.io/lancedb/embeddings/default_embedding_functions/).
+
+
+### Install dlt with LanceDB
+
+To use LanceDB as a destination, make sure `dlt` is installed with the `lancedb` extra:
+
+```sh
+pip install "dlt[lancedb]"
+```
+
+The `lancedb` extra installs only `dlt` and `lancedb`. You will also need to install your model provider's SDK.
+
+You can find out which libraries you need by referring to the [LanceDB docs](https://lancedb.github.io/lancedb/embeddings/default_embedding_functions/).
+
+### Configure the destination
+
+Configure the destination in the dlt secrets file, located at `~/.dlt/secrets.toml` by default. Add the following section:
+
+```toml
+[destination.lancedb]
+embedding_model_provider = "cohere"
+embedding_model = "embed-english-v3.0"
+[destination.lancedb.credentials]
+uri = ".lancedb"
+api_key = "api_key" # API key to connect to LanceDB Cloud. Leave out if you are using LanceDB OSS.
+embedding_model_provider_api_key = "embedding_model_provider_api_key" # Not needed for providers that don't need authentication (ollama, sentence-transformers).
+```
+
+- The `uri` specifies the location of your LanceDB instance. It defaults to a local, on-disk instance if not provided.
+- The `api_key` is your API key for LanceDB Cloud connections. If you're using LanceDB OSS, you don't need to supply this key.
+- The `embedding_model_provider` specifies the embedding provider used for generating embeddings. The default is `cohere`.
+- The `embedding_model` specifies the model used by the embedding provider for generating embeddings.
+  Check with the embedding provider which options are available.
+  Reference https://lancedb.github.io/lancedb/embeddings/default_embedding_functions/.
+- The `embedding_model_provider_api_key` is the API key for the embedding model provider used to generate embeddings. If you're using a provider that doesn't need authentication, such as `ollama` or `sentence-transformers`, you don't need to supply this key.
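+
+For example, a fully local setup that doesn't require any API keys could look like the sketch below. The model name is only an illustration of what the `sentence-transformers` provider accepts; check the provider's documentation for available models, and install the provider's libraries as described above.
+
+```toml
+[destination.lancedb]
+embedding_model_provider = "sentence-transformers"
+embedding_model = "all-MiniLM-L6-v2"  # example model name, pick any model the provider supports
+[destination.lancedb.credentials]
+uri = ".lancedb"  # local, on-disk LanceDB instance, so no api_key is needed
+```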
+ +:::info Available Model Providers +- "gemini-text" +- "bedrock-text" +- "cohere" +- "gte-text" +- "imagebind" +- "instructor" +- "open-clip" +- "openai" +- "sentence-transformers" +- "huggingface" +- "colbert" +::: + +### Define your data source + +For example: + +```py +import dlt +from dlt.destinations.adapters import lancedb_adapter + + +movies = [ + { + "id": 1, + "title": "Blade Runner", + "year": 1982, + }, + { + "id": 2, + "title": "Ghost in the Shell", + "year": 1995, + }, + { + "id": 3, + "title": "The Matrix", + "year": 1999, + }, +] +``` + +### Create a pipeline: + +```py +pipeline = dlt.pipeline( + pipeline_name="movies", + destination="lancedb", + dataset_name="MoviesDataset", +) +``` + +### Run the pipeline: + +```py +info = pipeline.run( + lancedb_adapter( + movies, + embed="title", + ) +) +``` + +The data is now loaded into LanceDB. + +To use **vector search** after loading, you **must specify which fields LanceDB should generate embeddings for**. Do this by wrapping the data (or dlt resource) with the **`lancedb_adapter`** +function. + +## Using an Adapter to Specify Columns to Vectorise + +Out of the box, LanceDB will act as a normal database. To use LanceDB's embedding facilities, you'll need to specify which fields you'd like to embed in your dlt resource. + +The `lancedb_adapter` is a helper function that configures the resource for the LanceDB destination: + +```py +lancedb_adapter(data, embed) +``` + +It accepts the following arguments: + +- `data`: a dlt resource object, or a Python data structure (e.g. a list of dictionaries). +- `embed`: a name of the field or a list of names to generate embeddings for. + +Returns: [dlt resource](../../general-usage/resource.md) object that you can pass to the `pipeline.run()`. + +Example: + +```py +lancedb_adapter( + resource, + embed=["title", "description"], +) +``` + +Bear in mind that you can't use an adapter on a [dlt source](../../general-usage/source.md), only a [dlt resource](../../general-usage/resource.md). + +## Write disposition + +All [write dispositions](../../general-usage/incremental-loading.md#choosing-a-write-disposition) are supported by the LanceDB destination. + +### Replace + +The [replace](../../general-usage/full-loading.md) disposition replaces the data in the destination with the data from the resource. + +```py +info = pipeline.run( + lancedb_adapter( + movies, + embed="title", + ), + write_disposition="replace", +) +``` + +### Merge + +The [merge](../../general-usage/incremental-loading.md) write disposition merges the data from the resource with the data at the destination based on a unique identifier. + +```py +pipeline.run( + lancedb_adapter( + movies, + embed="title", + ), + write_disposition="merge", + primary_key="id", +) +``` + +### Append + +This is the default disposition. It will append the data to the existing data in the destination. + +## Additional Destination Options + +- `dataset_separator`: The character used to separate the dataset name from table names. Defaults to "___". +- `vector_field_name`: The name of the special field to store vector embeddings. Defaults to "vector__". +- `id_field_name`: The name of the special field used for deduplication and merging. Defaults to "id__". +- `max_retries`: The maximum number of retries for embedding operations. Set to 0 to disable retries. Defaults to 3. + + +## dbt support + +The LanceDB destination doesn't support dbt integration. + +## Syncing of `dlt` state + +The LanceDB destination supports syncing of the `dlt` state. 
+ +## Current Limitations + +Adding new fields to an existing LanceDB table requires loading the entire table data into memory as a PyArrow table. +This is because PyArrow tables are immutable, so adding fields requires creating a new table with the updated schema. + +For huge tables, this may impact performance and memory usage since the full table must be loaded into memory to add the new fields. +Keep these considerations in mind when working with large datasets and monitor memory usage if adding fields to sizable existing tables. + + + diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 1ea92f2e91..4fa1c58eae 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -116,6 +116,7 @@ const sidebars = { 'dlt-ecosystem/destinations/snowflake', 'dlt-ecosystem/destinations/athena', 'dlt-ecosystem/destinations/weaviate', + 'dlt-ecosystem/destinations/lancedb', 'dlt-ecosystem/destinations/qdrant', 'dlt-ecosystem/destinations/dremio', 'dlt-ecosystem/destinations/destination', diff --git a/poetry.lock b/poetry.lock index 894f5868bc..1543d079c2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "about-time" @@ -3552,164 +3552,6 @@ files = [ {file = "google_re2-1.1-1-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c6c9f64b9724ec38da8e514f404ac64e9a6a5e8b1d7031c2dadd05c1f4c16fd"}, {file = "google_re2-1.1-1-cp39-cp39-win32.whl", hash = "sha256:d1b751b9ab9f8e2ab2a36d72b909281ce65f328c9115a1685acae1a2d1afd7a4"}, {file = "google_re2-1.1-1-cp39-cp39-win_amd64.whl", hash = "sha256:ac775c75cec7069351d201da4e0fb0cae4c1c5ebecd08fa34e1be89740c1d80b"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5eaefe4705b75ca5f78178a50104b689e9282f868e12f119b26b4cffc0c7ee6e"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:e35f2c8aabfaaa4ce6420b3cae86c0c29042b1b4f9937254347e9b985694a171"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:35fd189cbaaaa39c9a6a8a00164c8d9c709bacd0c231c694936879609beff516"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:60475d222cebd066c80414831c8a42aa2449aab252084102ee05440896586e6a"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:871cb85b9b0e1784c983b5c148156b3c5314cb29ca70432dff0d163c5c08d7e5"}, - {file = "google_re2-1.1-2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:94f4e66e34bdb8de91ec6cdf20ba4fa9fea1dfdcfb77ff1f59700d01a0243664"}, - {file = "google_re2-1.1-2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1563577e2b720d267c4cffacc0f6a2b5c8480ea966ebdb1844fbea6602c7496f"}, - {file = "google_re2-1.1-2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:49b7964532a801b96062d78c0222d155873968f823a546a3dbe63d73f25bb56f"}, - {file = "google_re2-1.1-2-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2362fd70eb639a75fd0187d28b4ba7b20b3088833d8ad7ffd8693d0ba159e1c2"}, - {file = "google_re2-1.1-2-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:86b80719636a4e21391e20a9adf18173ee6ae2ec956726fe2ff587417b5e8ba6"}, - {file = "google_re2-1.1-2-cp310-cp310-win32.whl", hash = "sha256:5456fba09df951fe8d1714474ed1ecda102a68ddffab0113e6c117d2e64e6f2b"}, - {file 
= "google_re2-1.1-2-cp310-cp310-win_amd64.whl", hash = "sha256:2ac6936a3a60d8d9de9563e90227b3aea27068f597274ca192c999a12d8baa8f"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d5a87b436028ec9b0f02fe19d4cbc19ef30441085cdfcdf1cce8fbe5c4bd5e9a"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:fc0d4163de9ed2155a77e7a2d59d94c348a6bbab3cff88922fab9e0d3d24faec"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:48b12d953bc796736e7831d67b36892fb6419a4cc44cb16521fe291e594bfe23"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:62c780c927cff98c1538439f0ff616f48a9b2e8837c676f53170d8ae5b9e83cb"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:04b2aefd768aa4edeef8b273327806c9cb0b82e90ff52eacf5d11003ac7a0db2"}, - {file = "google_re2-1.1-2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:9c90175992346519ee7546d9af9a64541c05b6b70346b0ddc54a48aa0d3b6554"}, - {file = "google_re2-1.1-2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22ad9ad9d125249d6386a2e80efb9de7af8260b703b6be7fa0ab069c1cf56ced"}, - {file = "google_re2-1.1-2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f70971f6ffe5254e476e71d449089917f50ebf9cf60f9cec80975ab1693777e2"}, - {file = "google_re2-1.1-2-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f267499529e64a4abed24c588f355ebe4700189d434d84a7367725f5a186e48d"}, - {file = "google_re2-1.1-2-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b632eff5e4cd44545a9c0e52f2e1becd55831e25f4dd4e0d7ec8ee6ca50858c1"}, - {file = "google_re2-1.1-2-cp311-cp311-win32.whl", hash = "sha256:a42c733036e8f242ee4e5f0e27153ad4ca44ced9e4ce82f3972938ddee528db0"}, - {file = "google_re2-1.1-2-cp311-cp311-win_amd64.whl", hash = "sha256:64f8eed4ca96905d99b5286b3d14b5ca4f6a025ff3c1351626a7df2f93ad1ddd"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5541efcca5b5faf7e0d882334a04fa479bad4e7433f94870f46272eec0672c4a"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:92309af35b6eb2d3b3dc57045cdd83a76370958ab3e0edd2cc4638f6d23f5b32"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:197cd9bcaba96d18c5bf84d0c32fca7a26c234ea83b1d3083366f4392cb99f78"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:1b896f171d29b541256cf26e10dccc9103ac1894683914ed88828ca6facf8dca"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:e022d3239b945014e916ca7120fee659b246ec26c301f9e0542f1a19b38a8744"}, - {file = "google_re2-1.1-2-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:2c73f8a9440873b68bee1198094377501065e85aaf6fcc0d2512c7589ffa06ca"}, - {file = "google_re2-1.1-2-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:901d86555bd7725506d651afaba7d71cd4abd13260aed6cfd7c641a45f76d4f6"}, - {file = "google_re2-1.1-2-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ce4710ff636701cfb56eb91c19b775d53b03749a23b7d2a5071bbbf4342a9067"}, - {file = "google_re2-1.1-2-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76a20e5ebdf5bc5d430530197e42a2eeb562f729d3a3fb51f39168283d676e66"}, - {file = "google_re2-1.1-2-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:77c9f4d4bb1c8de9d2642d3c4b8b615858ba764df025b3b4f1310266f8def269"}, - {file = 
"google_re2-1.1-2-cp38-cp38-win32.whl", hash = "sha256:94bd60785bf37ef130a1613738e3c39465a67eae3f3be44bb918540d39b68da3"}, - {file = "google_re2-1.1-2-cp38-cp38-win_amd64.whl", hash = "sha256:59efeb77c0dcdbe37794c61f29c5b1f34bc06e8ec309a111ccdd29d380644d70"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:221e38c27e1dd9ccb8e911e9c7aed6439f68ce81e7bb74001076830b0d6e931d"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:d9145879e6c2e1b814445300b31f88a675e1f06c57564670d95a1442e8370c27"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:c8a12f0740e2a52826bdbf95569a4b0abdf413b4012fa71e94ad25dd4715c6e5"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:9c9998f71466f4db7bda752aa7c348b2881ff688e361108fe500caad1d8b9cb2"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:0c39f69b702005963a3d3bf78743e1733ad73efd7e6e8465d76e3009e4694ceb"}, - {file = "google_re2-1.1-2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:6d0ce762dee8d6617d0b1788a9653e805e83a23046c441d0ea65f1e27bf84114"}, - {file = "google_re2-1.1-2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ecf3619d98c9b4a7844ab52552ad32597cdbc9a5bdbc7e3435391c653600d1e2"}, - {file = "google_re2-1.1-2-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9a1426a8cbd1fa004974574708d496005bd379310c4b1c7012be4bc75efde7a8"}, - {file = "google_re2-1.1-2-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a1a30626ba48b4070f3eab272d860ef1952e710b088792c4d68dddb155be6bfc"}, - {file = "google_re2-1.1-2-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1b9c1ffcfbc3095b6ff601ec2d2bf662988f6ea6763bc1c9d52bec55881f8fde"}, - {file = "google_re2-1.1-2-cp39-cp39-win32.whl", hash = "sha256:32ecf995a252c0548404c1065ba4b36f1e524f1f4a86b6367a1a6c3da3801e30"}, - {file = "google_re2-1.1-2-cp39-cp39-win_amd64.whl", hash = "sha256:e7865410f3b112a3609739283ec3f4f6f25aae827ff59c6bfdf806fd394d753e"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3b21f83f0a201009c56f06fcc7294a33555ede97130e8a91b3f4cae01aed1d73"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b38194b91354a38db1f86f25d09cdc6ac85d63aee4c67b43da3048ce637adf45"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e7da3da8d6b5a18d6c3b61b11cc5b66b8564eaedce99d2312b15b6487730fc76"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:aeca656fb10d8638f245331aabab59c9e7e051ca974b366dd79e6a9efb12e401"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:2069d6dc94f5fa14a159bf99cad2f11e9c0f8ec3b7f44a4dde9e59afe5d1c786"}, - {file = "google_re2-1.1-3-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:2319a39305a4931cb5251451f2582713418a19bef2af7adf9e2a7a0edd939b99"}, - {file = "google_re2-1.1-3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb98fc131699756c6d86246f670a5e1c1cc1ba85413c425ad344cb30479b246c"}, - {file = "google_re2-1.1-3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a6e038986d8ffe4e269f8532f03009f229d1f6018d4ac0dabc8aff876338f6e0"}, - {file = "google_re2-1.1-3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8618343ee658310e0f53bf586fab7409de43ce82bf8d9f7eb119536adc9783fd"}, - {file = 
"google_re2-1.1-3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d8140ca861cfe00602319cefe2c7b8737b379eb07fb328b51dc44584f47a2718"}, - {file = "google_re2-1.1-3-cp310-cp310-win32.whl", hash = "sha256:41f439c5c54e8a3a0a1fa2dbd1e809d3f643f862df7b16dd790f36a1238a272e"}, - {file = "google_re2-1.1-3-cp310-cp310-win_amd64.whl", hash = "sha256:fe20e97a33176d96d3e4b5b401de35182b9505823abea51425ec011f53ef5e56"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c39ff52b1765db039f690ee5b7b23919d8535aae94db7996079fbde0098c4d7"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:5420be674fd164041639ba4c825450f3d4bd635572acdde16b3dcd697f8aa3ef"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:ff53881cf1ce040f102a42d39db93c3f835f522337ae9c79839a842f26d97733"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:8d04600b0b53523118df2e413a71417c408f20dee640bf07dfab601c96a18a77"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:c4835d4849faa34a7fa1074098d81c420ed6c0707a3772482b02ce14f2a7c007"}, - {file = "google_re2-1.1-3-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:3309a9b81251d35fee15974d0ae0581a9a375266deeafdc3a3ac0d172a742357"}, - {file = "google_re2-1.1-3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e2b51cafee7e0bc72d0a4a454547bd8f257cde412ac9f1a2dc46a203b5e42cf4"}, - {file = "google_re2-1.1-3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:83f5f1cb52f832c2297d271ee8c56cf5e9053448162e5d2223d513f729bad908"}, - {file = "google_re2-1.1-3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:55865a1ace92be3f7953b2e2b38b901d8074a367aa491daee43260a53a7fc6f0"}, - {file = "google_re2-1.1-3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cec2167dd142e583e98c783bd0d28b8cf5a9cdbe1f7407ba4163fe3ccb613cb9"}, - {file = "google_re2-1.1-3-cp311-cp311-win32.whl", hash = "sha256:a0bc1fe96849e4eb8b726d0bba493f5b989372243b32fe20729cace02e5a214d"}, - {file = "google_re2-1.1-3-cp311-cp311-win_amd64.whl", hash = "sha256:e6310a156db96fc5957cb007dd2feb18476898654530683897469447df73a7cd"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8e63cd10ea006088b320e8c5d308da1f6c87aa95138a71c60dd7ca1c8e91927e"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:12b566830a334178733a85e416b1e0507dbc0ceb322827616fe51ef56c5154f1"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:442e18c9d46b225c1496919c16eafe8f8d9bb4091b00b4d3440da03c55bbf4ed"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:c54c00263a9c39b2dacd93e9636319af51e3cf885c080b9680a9631708326460"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:15a3caeeb327bc22e0c9f95eb76890fec8874cacccd2b01ff5c080ab4819bbec"}, - {file = "google_re2-1.1-3-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:59ec0d2cced77f715d41f6eafd901f6b15c11e28ba25fe0effdc1de554d78e75"}, - {file = "google_re2-1.1-3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:185bf0e3441aed3840590f8e42f916e2920d235eb14df2cbc2049526803d3e71"}, - {file = "google_re2-1.1-3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:586d3f2014eea5be14d8de53374d9b79fa99689160e00efa64b5fe93af326087"}, - {file = 
"google_re2-1.1-3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cc2575082de4ffd234d9607f3ae67ca22b15a1a88793240e2045f3b3a36a5795"}, - {file = "google_re2-1.1-3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:59c5ad438eddb3630def394456091284d7bbc5b89351987f94f3792d296d1f96"}, - {file = "google_re2-1.1-3-cp312-cp312-win32.whl", hash = "sha256:5b9878c53f2bf16f75bf71d4ddd57f6611351408d5821040e91c53ebdf82c373"}, - {file = "google_re2-1.1-3-cp312-cp312-win_amd64.whl", hash = "sha256:4fdecfeb213110d0a85bad335a8e7cdb59fea7de81a4fe659233f487171980f9"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2dd87bacab32b709c28d0145fe75a956b6a39e28f0726d867375dba5721c76c1"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:55d24c61fe35dddc1bb484593a57c9f60f9e66d7f31f091ef9608ed0b6dde79f"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a0cf1180d908622df648c26b0cd09281f92129805ccc56a39227fdbfeab95cb4"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:09586f07f3f88d432265c75976da1c619ab7192cd7ebdf53f4ae0776c19e4b56"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:539f1b053402203576e919a06749198da4ae415931ee28948a1898131ae932ce"}, - {file = "google_re2-1.1-3-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:abf0bcb5365b0e27a5a23f3da403dffdbbac2c0e3a3f1535a8b10cc121b5d5fb"}, - {file = "google_re2-1.1-3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:19c83e5bbed7958213eeac3aa71c506525ce54faf03e07d0b96cd0a764890511"}, - {file = "google_re2-1.1-3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3348e77330ff672dc44ec01894fa5d93c409a532b6d688feac55e714e9059920"}, - {file = "google_re2-1.1-3-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:06b63edb57c5ce5a13eabfd71155e346b9477dc8906dec7c580d4f70c16a7e0d"}, - {file = "google_re2-1.1-3-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:12fe57ba2914092b83338d61d8def9ebd5a2bd0fd8679eceb5d4c2748105d5c0"}, - {file = "google_re2-1.1-3-cp38-cp38-win32.whl", hash = "sha256:80796e08d24e606e675019fe8de4eb5c94bb765be13c384f2695247d54a6df75"}, - {file = "google_re2-1.1-3-cp38-cp38-win_amd64.whl", hash = "sha256:3c2257dedfe7cc5deb6791e563af9e071a9d414dad89e37ac7ad22f91be171a9"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:43a0cd77c87c894f28969ac622f94b2e6d1571261dfdd785026848a25cfdc9b9"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:1038990b77fd66f279bd66a0832b67435ea925e15bb59eafc7b60fdec812b616"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fb5dda6875d18dd45f0f24ebced6d1f7388867c8fb04a235d1deab7ea479ce38"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:bb1d164965c6d57a351b421d2f77c051403766a8b75aaa602324ee2451fff77f"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a072ebfa495051d07ffecbf6ce21eb84793568d5c3c678c00ed8ff6b8066ab31"}, - {file = "google_re2-1.1-3-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:4eb66c8398c8a510adc97978d944b3b29c91181237218841ea1a91dc39ec0e54"}, - {file = "google_re2-1.1-3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f7c8b57b1f559553248d1757b7fa5b2e0cc845666738d155dff1987c2618264e"}, - {file = 
"google_re2-1.1-3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9162f6aa4f25453c682eb176f21b8e2f40205be9f667e98a54b3e1ff10d6ee75"}, - {file = "google_re2-1.1-3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a2d65ddf67fd7bf94705626871d463057d3d9a3538d41022f95b9d8f01df36e1"}, - {file = "google_re2-1.1-3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d140c7b9395b4d1e654127aa1c99bcc603ed01000b7bc7e28c52562f1894ec12"}, - {file = "google_re2-1.1-3-cp39-cp39-win32.whl", hash = "sha256:80c5fc200f64b2d903eeb07b8d6cefc620a872a0240c7caaa9aca05b20f5568f"}, - {file = "google_re2-1.1-3-cp39-cp39-win_amd64.whl", hash = "sha256:9eb6dbcee9b5dc4069bbc0634f2eb039ca524a14bed5868fdf6560aaafcbca06"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0db114d7e1aa96dbcea452a40136d7d747d60cbb61394965774688ef59cccd4e"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:82133958e003a1344e5b7a791b9a9dd7560b5c8f96936dbe16f294604524a633"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:9e74fd441d1f3d917d3303e319f61b82cdbd96b9a5ba919377a6eef1504a1e2b"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:734a2e7a4541c57253b5ebee24f3f3366ba3658bcad01da25fb623c78723471a"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:d88d5eecbc908abe16132456fae13690d0508f3ac5777f320ef95cb6cab9a961"}, - {file = "google_re2-1.1-4-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:b91db80b171ecec435a07977a227757dd487356701a32f556fa6fca5d0a40522"}, - {file = "google_re2-1.1-4-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b23129887a64bb9948af14c84705273ed1a40054e99433b4acccab4dcf6a226"}, - {file = "google_re2-1.1-4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5dc1a0cc7cd19261dcaf76763e2499305dbb7e51dc69555167cdb8af98782698"}, - {file = "google_re2-1.1-4-cp310-cp310-win32.whl", hash = "sha256:3b2ab1e2420b5dd9743a2d6bc61b64e5f708563702a75b6db86637837eaeaf2f"}, - {file = "google_re2-1.1-4-cp310-cp310-win_amd64.whl", hash = "sha256:92efca1a7ef83b6df012d432a1cbc71d10ff42200640c0f9a5ff5b343a48e633"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:854818fd4ce79787aca5ba459d6e5abe4ca9be2c684a5b06a7f1757452ca3708"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:4ceef51174b6f653b6659a8fdaa9c38960c5228b44b25be2a3bcd8566827554f"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:ee49087c3db7e6f5238105ab5299c09e9b77516fe8cfb0a37e5f1e813d76ecb8"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:dc2312854bdc01410acc5d935f1906a49cb1f28980341c20a68797ad89d8e178"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:0dc0d2e42296fa84a3cb3e1bd667c6969389cd5cdf0786e6b1f911ae2d75375b"}, - {file = "google_re2-1.1-4-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6bf04ced98453b035f84320f348f67578024f44d2997498def149054eb860ae8"}, - {file = "google_re2-1.1-4-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d6b6ef11dc4ab322fa66c2f3561925f2b5372a879c3ed764d20e939e2fd3e5f"}, - {file = "google_re2-1.1-4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0dcde6646fa9a97fd3692b3f6ae7daf7f3277d7500b6c253badeefa11db8956a"}, - {file = 
"google_re2-1.1-4-cp311-cp311-win32.whl", hash = "sha256:5f4f0229deb057348893574d5b0a96d055abebac6debf29d95b0c0e26524c9f6"}, - {file = "google_re2-1.1-4-cp311-cp311-win_amd64.whl", hash = "sha256:4713ddbe48a18875270b36a462b0eada5e84d6826f8df7edd328d8706b6f9d07"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:40a698300b8faddbb325662973f839489c89b960087060bd389c376828978a04"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:103d2d7ac92ba23911a151fd1fc7035cbf6dc92a7f6aea92270ebceb5cd5acd3"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:51fb7182bccab05e8258a2b6a63dda1a6b4a9e8dfb9b03ec50e50c49c2827dd4"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:65383022abd63d7b620221eba7935132b53244b8b463d8fdce498c93cf58b7b7"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:396281fc68a9337157b3ffcd9392c6b7fcb8aab43e5bdab496262a81d56a4ecc"}, - {file = "google_re2-1.1-4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:8198adcfcff1c680e052044124621730fc48d08005f90a75487f5651f1ebfce2"}, - {file = "google_re2-1.1-4-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:81f7bff07c448aec4db9ca453d2126ece8710dbd9278b8bb09642045d3402a96"}, - {file = "google_re2-1.1-4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7dacf730fd7d6ec71b11d6404b0b26e230814bfc8e9bb0d3f13bec9b5531f8d"}, - {file = "google_re2-1.1-4-cp312-cp312-win32.whl", hash = "sha256:8c764f62f4b1d89d1ef264853b6dd9fee14a89e9b86a81bc2157fe3531425eb4"}, - {file = "google_re2-1.1-4-cp312-cp312-win_amd64.whl", hash = "sha256:0be2666df4bc5381a5d693585f9bbfefb0bfd3c07530d7e403f181f5de47254a"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:5cb1b63a0bfd8dd65d39d2f3b2e5ae0a06ce4b2ce5818a1d1fc78a786a252673"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:e41751ce6b67a95230edd0772226dc94c2952a2909674cd69df9804ed0125307"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:b998cfa2d50bf4c063e777c999a7e8645ec7e5d7baf43ad71b1e2e10bb0300c3"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:226ca3b0c2e970f3fc82001ac89e845ecc7a4bb7c68583e7a76cda70b61251a7"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:9adec1f734ebad7c72e56c85f205a281d8fe9bf6583bc21020157d3f2812ce89"}, - {file = "google_re2-1.1-4-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:9c34f3c64ba566af967d29e11299560e6fdfacd8ca695120a7062b6ed993b179"}, - {file = "google_re2-1.1-4-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e1b85385fe293838e0d0b6e19e6c48ba8c6f739ea92ce2e23b718afe7b343363"}, - {file = "google_re2-1.1-4-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4694daa8a8987cfb568847aa872f9990e930c91a68c892ead876411d4b9012c3"}, - {file = "google_re2-1.1-4-cp38-cp38-win32.whl", hash = "sha256:5e671e9be1668187e2995aac378de574fa40df70bb6f04657af4d30a79274ce0"}, - {file = "google_re2-1.1-4-cp38-cp38-win_amd64.whl", hash = "sha256:f66c164d6049a8299f6dfcfa52d1580576b4b9724d6fcdad2f36f8f5da9304b6"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:25cb17ae0993a48c70596f3a3ef5d659638106401cc8193f51c0d7961b3b3eb7"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:5f101f86d14ca94ca4dcf63cceaa73d351f2be2481fcaa29d9e68eeab0dc2a88"}, - 
{file = "google_re2-1.1-4-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:4e82591e85bf262a6d74cff152867e05fc97867c68ba81d6836ff8b0e7e62365"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:1f61c09b93ffd34b1e2557e5a9565039f935407a5786dbad46f64f1a484166e6"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:12b390ad8c7e74bab068732f774e75e0680dade6469b249a721f3432f90edfc3"}, - {file = "google_re2-1.1-4-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:1284343eb31c2e82ed2d8159f33ba6842238a56782c881b07845a6d85613b055"}, - {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c7b38e0daf2c06e4d3163f4c732ab3ad2521aecfed6605b69e4482c612da303"}, - {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"}, - {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = "sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"}, - {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"}, ] [[package]] @@ -4486,6 +4328,42 @@ docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"] embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "instructorembedding", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"] tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars (>=0.19)", "pytest", "pytest-asyncio", "pytest-mock", "pytz", "tantivy"] +[[package]] +name = "lancedb" +version = "0.9.0" +description = "lancedb" +optional = false +python-versions = ">=3.8" +files = [ + {file = "lancedb-0.9.0-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:b1ca08797c72c93ae512aa1078f1891756da157d910fbae8e194fac3528fc1ac"}, + {file = "lancedb-0.9.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:15129791f03c2c04b95f914ced2c1556b43d73a24710207b9af77b6e4008bdeb"}, + {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f093d89447a2039b820d2540a0b64df3024e4549b6808ebd26b44fbe0345cc6"}, + {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:a8c1f6777e217d2277451038866d280fa5fb38bd161795e51703b043c26dd345"}, + {file = "lancedb-0.9.0-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:78dd5800a1148f89d33b7e98d1c8b1c42dee146f03580abc1ca83cb05273ff7f"}, + {file = "lancedb-0.9.0-cp38-abi3-win_amd64.whl", hash = "sha256:ba5bdc727d3bc131f17414f42372acde5817073feeb553793a3d20003caa1658"}, +] + +[package.dependencies] +attrs = ">=21.3.0" +cachetools = "*" +deprecation = "*" +overrides = ">=0.7" +packaging = "*" +pydantic = ">=1.10" +pylance = "0.13.0" +ratelimiter = ">=1.0,<2.0" +requests = ">=2.31.0" +retry = ">=0.9.2" +tqdm = ">=4.27.0" + +[package.extras] +azure = ["adlfs (>=2024.2.0)"] +clip = ["open-clip", "pillow", "torch"] +dev = ["pre-commit", "ruff"] +docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"] +embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "instructorembedding", "ollama", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"] +tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars (>=0.19)", "pytest", "pytest-asyncio", "pytest-mock", "pytz", "tantivy"] + 
[[package]] name = "lazy-object-proxy" version = "1.9.0" @@ -4662,13 +4540,10 @@ files = [ {file = "lxml-4.9.3-cp27-cp27m-macosx_11_0_x86_64.whl", hash = "sha256:b0a545b46b526d418eb91754565ba5b63b1c0b12f9bd2f808c852d9b4b2f9b5c"}, {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:075b731ddd9e7f68ad24c635374211376aa05a281673ede86cbe1d1b3455279d"}, {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1e224d5755dba2f4a9498e150c43792392ac9b5380aa1b845f98a1618c94eeef"}, - {file = "lxml-4.9.3-cp27-cp27m-win32.whl", hash = "sha256:2c74524e179f2ad6d2a4f7caf70e2d96639c0954c943ad601a9e146c76408ed7"}, - {file = "lxml-4.9.3-cp27-cp27m-win_amd64.whl", hash = "sha256:4f1026bc732b6a7f96369f7bfe1a4f2290fb34dce00d8644bc3036fb351a4ca1"}, {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c0781a98ff5e6586926293e59480b64ddd46282953203c76ae15dbbbf302e8bb"}, {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cef2502e7e8a96fe5ad686d60b49e1ab03e438bd9123987994528febd569868e"}, {file = "lxml-4.9.3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b86164d2cff4d3aaa1f04a14685cbc072efd0b4f99ca5708b2ad1b9b5988a991"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:42871176e7896d5d45138f6d28751053c711ed4d48d8e30b498da155af39aebd"}, - {file = "lxml-4.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ae8b9c6deb1e634ba4f1930eb67ef6e6bf6a44b6eb5ad605642b2d6d5ed9ce3c"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:411007c0d88188d9f621b11d252cce90c4a2d1a49db6c068e3c16422f306eab8"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:cd47b4a0d41d2afa3e58e5bf1f62069255aa2fd6ff5ee41604418ca925911d76"}, {file = "lxml-4.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e2cb47860da1f7e9a5256254b74ae331687b9672dfa780eed355c4c9c3dbd23"}, @@ -4677,7 +4552,6 @@ files = [ {file = "lxml-4.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:97047f0d25cd4bcae81f9ec9dc290ca3e15927c192df17331b53bebe0e3ff96d"}, {file = "lxml-4.9.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:1f447ea5429b54f9582d4b955f5f1985f278ce5cf169f72eea8afd9502973dd5"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:57d6ba0ca2b0c462f339640d22882acc711de224d769edf29962b09f77129cbf"}, - {file = "lxml-4.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:9767e79108424fb6c3edf8f81e6730666a50feb01a328f4a016464a5893f835a"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:71c52db65e4b56b8ddc5bb89fb2e66c558ed9d1a74a45ceb7dcb20c191c3df2f"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d73d8ecf8ecf10a3bd007f2192725a34bd62898e8da27eb9d32a58084f93962b"}, {file = "lxml-4.9.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0a3d3487f07c1d7f150894c238299934a2a074ef590b583103a45002035be120"}, @@ -4697,7 +4571,6 @@ files = [ {file = "lxml-4.9.3-cp36-cp36m-macosx_11_0_x86_64.whl", hash = "sha256:64f479d719dc9f4c813ad9bb6b28f8390360660b73b2e4beb4cb0ae7104f1c12"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:dd708cf4ee4408cf46a48b108fb9427bfa00b9b85812a9262b5c668af2533ea5"}, {file = 
"lxml-4.9.3-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c31c7462abdf8f2ac0577d9f05279727e698f97ecbb02f17939ea99ae8daa98"}, - {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e3cd95e10c2610c360154afdc2f1480aea394f4a4f1ea0a5eacce49640c9b190"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:4930be26af26ac545c3dffb662521d4e6268352866956672231887d18f0eaab2"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4aec80cde9197340bc353d2768e2a75f5f60bacda2bab72ab1dc499589b3878c"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:14e019fd83b831b2e61baed40cab76222139926b1fb5ed0e79225bc0cae14584"}, @@ -4707,7 +4580,6 @@ files = [ {file = "lxml-4.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bef4e656f7d98aaa3486d2627e7d2df1157d7e88e7efd43a65aa5dd4714916cf"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:46f409a2d60f634fe550f7133ed30ad5321ae2e6630f13657fb9479506b00601"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:4c28a9144688aef80d6ea666c809b4b0e50010a2aca784c97f5e6bf143d9f129"}, - {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:141f1d1a9b663c679dc524af3ea1773e618907e96075262726c7612c02b149a4"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:53ace1c1fd5a74ef662f844a0413446c0629d151055340e9893da958a374f70d"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17a753023436a18e27dd7769e798ce302963c236bc4114ceee5b25c18c52c693"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7d298a1bd60c067ea75d9f684f5f3992c9d6766fadbc0bcedd39750bf344c2f4"}, @@ -4717,7 +4589,6 @@ files = [ {file = "lxml-4.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:120fa9349a24c7043854c53cae8cec227e1f79195a7493e09e0c12e29f918e52"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4d2d1edbca80b510443f51afd8496be95529db04a509bc8faee49c7b0fb6d2cc"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:8d7e43bd40f65f7d97ad8ef5c9b1778943d02f04febef12def25f7583d19baac"}, - {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:71d66ee82e7417828af6ecd7db817913cb0cf9d4e61aa0ac1fde0583d84358db"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:6fc3c450eaa0b56f815c7b62f2b7fba7266c4779adcf1cece9e6deb1de7305ce"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:65299ea57d82fb91c7f019300d24050c4ddeb7c5a190e076b5f48a2b43d19c42"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:eadfbbbfb41b44034a4c757fd5d70baccd43296fb894dba0295606a7cf3124aa"}, @@ -4727,7 +4598,6 @@ files = [ {file = "lxml-4.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:92af161ecbdb2883c4593d5ed4815ea71b31fafd7fd05789b23100d081ecac96"}, {file = "lxml-4.9.3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:9bb6ad405121241e99a86efff22d3ef469024ce22875a7ae045896ad23ba2340"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = 
"sha256:8ed74706b26ad100433da4b9d807eae371efaa266ffc3e9191ea436087a9d6a7"}, - {file = "lxml-4.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:fbf521479bcac1e25a663df882c46a641a9bff6b56dc8b0fafaebd2f66fb231b"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:303bf1edce6ced16bf67a18a1cf8339d0db79577eec5d9a6d4a80f0fb10aa2da"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:5515edd2a6d1a5a70bfcdee23b42ec33425e405c5b351478ab7dc9347228f96e"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:690dafd0b187ed38583a648076865d8c229661ed20e48f2335d68e2cf7dc829d"}, @@ -4738,16 +4608,13 @@ files = [ {file = "lxml-4.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:4dd9a263e845a72eacb60d12401e37c616438ea2e5442885f65082c276dfb2b2"}, {file = "lxml-4.9.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6689a3d7fd13dc687e9102a27e98ef33730ac4fe37795d5036d18b4d527abd35"}, {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:f6bdac493b949141b733c5345b6ba8f87a226029cbabc7e9e121a413e49441e0"}, - {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:05186a0f1346ae12553d66df1cfce6f251589fea3ad3da4f3ef4e34b2d58c6a3"}, {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c2006f5c8d28dee289f7020f721354362fa304acbaaf9745751ac4006650254b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-macosx_11_0_x86_64.whl", hash = "sha256:5c245b783db29c4e4fbbbfc9c5a78be496c9fea25517f90606aa1f6b2b3d5f7b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4fb960a632a49f2f089d522f70496640fdf1218f1243889da3822e0a9f5f3ba7"}, - {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:50670615eaf97227d5dc60de2dc99fb134a7130d310d783314e7724bf163f75d"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9719fe17307a9e814580af1f5c6e05ca593b12fb7e44fe62450a5384dbf61b4b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:3331bece23c9ee066e0fb3f96c61322b9e0f54d775fccefff4c38ca488de283a"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-macosx_11_0_x86_64.whl", hash = "sha256:ed667f49b11360951e201453fc3967344d0d0263aa415e1619e85ae7fd17b4e0"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8b77946fd508cbf0fccd8e400a7f71d4ac0e1595812e66025bac475a8e811694"}, - {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e4da8ca0c0c0aea88fd46be8e44bd49716772358d648cce45fe387f7b92374a7"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fe4bda6bd4340caa6e5cf95e73f8fea5c4bfc55763dd42f1b50a94c1b4a2fbd4"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f3df3db1d336b9356dd3112eae5f5c2b8b377f3bc826848567f10bfddfee77e9"}, {file = "lxml-4.9.3.tar.gz", hash = "sha256:48628bd53a426c9eb9bc066a923acaa0878d1e86129fd5359aee99285f4eed9c"}, @@ -4908,16 +4775,6 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = 
"sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -6916,6 +6773,32 @@ ray = ["ray[data]"] tests = ["boto3", "datasets", "duckdb", "h5py (<3.11)", "ml-dtypes", "pandas", "pillow", "polars[pandas,pyarrow]", "pytest", "tensorflow", "tqdm"] torch = ["torch"] +[[package]] +name = "pylance" +version = "0.13.0" +description = "python wrapper for Lance columnar format" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pylance-0.13.0-cp39-abi3-macosx_10_15_x86_64.whl", hash = "sha256:2f3d6f9eec1f59f45dccb01075ba79868b8d37c8371d6210bcf6418217a0dd8b"}, + {file = "pylance-0.13.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:f4861ab466c94b0f9a4b4e6de6e1dfa02f40e7242d8db87447bc7bb7d89606ac"}, + {file = "pylance-0.13.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3cb92547e145f5bfb0ea7d6f483953913b9bdd44c45bea84fc95a18da9f5853"}, + {file = "pylance-0.13.0-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:d1ddd7700924bc6b6b0774ea63d2aa23f9210a86cd6d6af0cdfa987df776d50d"}, + {file = "pylance-0.13.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:c51d4b6e59cf4dc97c11a35b299f11e80dbdf392e2d8dc498573c26474a3c19e"}, + {file = "pylance-0.13.0-cp39-abi3-win_amd64.whl", hash = "sha256:4018ba016f1445874960a4ba2ad5c80cb380f3116683282ee8beabd38fa8989d"}, +] + 
+[package.dependencies] +numpy = ">=1.22" +pyarrow = ">=12,<15.0.1" + +[package.extras] +benchmarks = ["pytest-benchmark"] +dev = ["ruff (==0.4.1)"] +ray = ["ray[data]"] +tests = ["boto3", "datasets", "duckdb", "h5py (<3.11)", "ml-dtypes", "pandas", "pillow", "polars[pandas,pyarrow]", "pytest", "tensorflow", "tqdm"] +torch = ["torch"] + [[package]] name = "pymongo" version = "4.6.0" @@ -6953,7 +6836,6 @@ files = [ {file = "pymongo-4.6.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ab6bcc8e424e07c1d4ba6df96f7fb963bcb48f590b9456de9ebd03b88084fe8"}, {file = "pymongo-4.6.0-cp312-cp312-win32.whl", hash = "sha256:47aa128be2e66abd9d1a9b0437c62499d812d291f17b55185cb4aa33a5f710a4"}, {file = "pymongo-4.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:014e7049dd019a6663747ca7dae328943e14f7261f7c1381045dfc26a04fa330"}, - {file = "pymongo-4.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e24025625bad66895b1bc3ae1647f48f0a92dd014108fb1be404c77f0b69ca67"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:288c21ab9531b037f7efa4e467b33176bc73a0c27223c141b822ab4a0e66ff2a"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:747c84f4e690fbe6999c90ac97246c95d31460d890510e4a3fa61b7d2b87aa34"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:055f5c266e2767a88bb585d01137d9c7f778b0195d3dbf4a487ef0638be9b651"}, @@ -7394,7 +7276,6 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -7402,16 +7283,8 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -7428,7 +7301,6 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -7436,7 +7308,6 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ 
-8360,7 +8231,6 @@ files = [ {file = "SQLAlchemy-1.4.49-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:03db81b89fe7ef3857b4a00b63dedd632d6183d4ea5a31c5d8a92e000a41fc71"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:95b9df9afd680b7a3b13b38adf6e3a38995da5e162cc7524ef08e3be4e5ed3e1"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a63e43bf3f668c11bb0444ce6e809c1227b8f067ca1068898f3008a273f52b09"}, - {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca46de16650d143a928d10842939dab208e8d8c3a9a8757600cae9b7c579c5cd"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f835c050ebaa4e48b18403bed2c0fda986525896efd76c245bdd4db995e51a4c"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c21b172dfb22e0db303ff6419451f0cac891d2e911bb9fbf8003d717f1bcf91"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-win32.whl", hash = "sha256:5fb1ebdfc8373b5a291485757bd6431de8d7ed42c27439f543c81f6c8febd729"}, @@ -8370,35 +8240,26 @@ files = [ {file = "SQLAlchemy-1.4.49-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5debe7d49b8acf1f3035317e63d9ec8d5e4d904c6e75a2a9246a119f5f2fdf3d"}, {file = "SQLAlchemy-1.4.49-cp311-cp311-win32.whl", hash = "sha256:82b08e82da3756765c2e75f327b9bf6b0f043c9c3925fb95fb51e1567fa4ee87"}, {file = "SQLAlchemy-1.4.49-cp311-cp311-win_amd64.whl", hash = "sha256:171e04eeb5d1c0d96a544caf982621a1711d078dbc5c96f11d6469169bd003f1"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f23755c384c2969ca2f7667a83f7c5648fcf8b62a3f2bbd883d805454964a800"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8396e896e08e37032e87e7fbf4a15f431aa878c286dc7f79e616c2feacdb366c"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66da9627cfcc43bbdebd47bfe0145bb662041472393c03b7802253993b6b7c90"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-win32.whl", hash = "sha256:9a06e046ffeb8a484279e54bda0a5abfd9675f594a2e38ef3133d7e4d75b6214"}, - {file = "SQLAlchemy-1.4.49-cp312-cp312-win_amd64.whl", hash = "sha256:7cf8b90ad84ad3a45098b1c9f56f2b161601e4670827d6b892ea0e884569bd1d"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:36e58f8c4fe43984384e3fbe6341ac99b6b4e083de2fe838f0fdb91cebe9e9cb"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b31e67ff419013f99ad6f8fc73ee19ea31585e1e9fe773744c0f3ce58c039c30"}, - {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc22807a7e161c0d8f3da34018ab7c97ef6223578fcdd99b1d3e7ed1100a5db"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c14b29d9e1529f99efd550cd04dbb6db6ba5d690abb96d52de2bff4ed518bc95"}, {file = 
"SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c40f3470e084d31247aea228aa1c39bbc0904c2b9ccbf5d3cfa2ea2dac06f26d"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-win32.whl", hash = "sha256:706bfa02157b97c136547c406f263e4c6274a7b061b3eb9742915dd774bbc264"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-win_amd64.whl", hash = "sha256:a7f7b5c07ae5c0cfd24c2db86071fb2a3d947da7bd487e359cc91e67ac1c6d2e"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:4afbbf5ef41ac18e02c8dc1f86c04b22b7a2125f2a030e25bbb4aff31abb224b"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:24e300c0c2147484a002b175f4e1361f102e82c345bf263242f0449672a4bccf"}, - {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:393cd06c3b00b57f5421e2133e088df9cabcececcea180327e43b937b5a7caa5"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:201de072b818f8ad55c80d18d1a788729cccf9be6d9dc3b9d8613b053cd4836d"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7653ed6817c710d0c95558232aba799307d14ae084cc9b1f4c389157ec50df5c"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-win32.whl", hash = "sha256:647e0b309cb4512b1f1b78471fdaf72921b6fa6e750b9f891e09c6e2f0e5326f"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-win_amd64.whl", hash = "sha256:ab73ed1a05ff539afc4a7f8cf371764cdf79768ecb7d2ec691e3ff89abbc541e"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:37ce517c011560d68f1ffb28af65d7e06f873f191eb3a73af5671e9c3fada08a"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1878ce508edea4a879015ab5215546c444233881301e97ca16fe251e89f1c55"}, - {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95ab792ca493891d7a45a077e35b418f68435efb3e1706cb8155e20e86a9013c"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0e8e608983e6f85d0852ca61f97e521b62e67969e6e640fe6c6b575d4db68557"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccf956da45290df6e809ea12c54c02ace7f8ff4d765d6d3dfb3655ee876ce58d"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-win32.whl", hash = "sha256:f167c8175ab908ce48bd6550679cc6ea20ae169379e73c7720a28f89e53aa532"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-win_amd64.whl", hash = "sha256:45806315aae81a0c202752558f0df52b42d11dd7ba0097bf71e253b4215f34f4"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:b6d0c4b15d65087738a6e22e0ff461b407533ff65a73b818089efc8eb2b3e1de"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a843e34abfd4c797018fd8d00ffffa99fd5184c421f190b6ca99def4087689bd"}, - {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:738d7321212941ab19ba2acf02a68b8ee64987b248ffa2101630e8fccb549e0d"}, 
{file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1c890421651b45a681181301b3497e4d57c0d01dc001e10438a40e9a9c25ee77"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d26f280b8f0a8f497bc10573849ad6dc62e671d2468826e5c748d04ed9e670d5"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-win32.whl", hash = "sha256:ec2268de67f73b43320383947e74700e95c6770d0c68c4e615e9897e46296294"}, @@ -9569,6 +9430,7 @@ duckdb = ["duckdb"] filesystem = ["botocore", "s3fs"] gcp = ["gcsfs", "google-cloud-bigquery", "grpcio"] gs = ["gcsfs"] +lancedb = ["lancedb", "pyarrow"] motherduck = ["duckdb", "pyarrow"] mssql = ["pyodbc"] parquet = ["pyarrow"] @@ -9583,4 +9445,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "920869be38d3b82c2c62e77d814c43be62b71ed73d68d1c570ac70886d439b91" +content-hash = "9979b133732a91a49fe3014afde0f5e3455cbc26a129aecad6171672c3f0f4a9" diff --git a/pyproject.toml b/pyproject.toml index c7cda5a994..cd4d6a78da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,7 @@ qdrant-client = {version = ">=1.8", optional = true, extras = ["fastembed"]} databricks-sql-connector = {version = ">=2.9.3", optional = true} clickhouse-driver = { version = ">=0.2.7", optional = true } clickhouse-connect = { version = ">=0.7.7", optional = true } +lancedb = { version = ">=0.8.2", optional = true, markers = "python_version >= '3.9'" } deltalake = { version = ">=0.17.4", optional = true } [tool.poetry.extras] @@ -103,8 +104,10 @@ qdrant = ["qdrant-client"] databricks = ["databricks-sql-connector"] clickhouse = ["clickhouse-driver", "clickhouse-connect", "s3fs", "gcsfs", "adlfs", "pyarrow"] dremio = ["pyarrow"] +lancedb = ["lancedb", "pyarrow"] deltalake = ["deltalake", "pyarrow"] + [tool.poetry.scripts] dlt = "dlt.cli._dlt:_main" diff --git a/tests/load/lancedb/__init__.py b/tests/load/lancedb/__init__.py new file mode 100644 index 0000000000..fb4bf0b35d --- /dev/null +++ b/tests/load/lancedb/__init__.py @@ -0,0 +1,3 @@ +from tests.utils import skip_if_not_active + +skip_if_not_active("lancedb") diff --git a/tests/load/lancedb/test_config.py b/tests/load/lancedb/test_config.py new file mode 100644 index 0000000000..c1d658d4fe --- /dev/null +++ b/tests/load/lancedb/test_config.py @@ -0,0 +1,35 @@ +import os +from typing import Iterator + +import pytest + +from dlt.common.configuration import resolve_configuration +from dlt.common.utils import digest128 +from dlt.destinations.impl.lancedb.configuration import ( + LanceDBClientConfiguration, +) +from tests.load.utils import ( + drop_active_pipeline_data, +) + + +# Mark all tests as essential, do not remove. 
+pytestmark = pytest.mark.essential + + +@pytest.fixture(autouse=True) +def drop_lancedb_data() -> Iterator[None]: + yield + drop_active_pipeline_data() + + +def test_lancedb_configuration() -> None: + os.environ["DESTINATION__LANCEDB__EMBEDDING_MODEL_PROVIDER"] = "colbert" + os.environ["DESTINATION__LANCEDB__EMBEDDING_MODEL"] = "text-embedding-3-small" + + config = resolve_configuration( + LanceDBClientConfiguration()._bind_dataset_name(dataset_name="dataset"), + sections=("destination", "lancedb"), + ) + assert config.embedding_model_provider == "colbert" + assert config.embedding_model == "text-embedding-3-small" diff --git a/tests/load/lancedb/test_pipeline.py b/tests/load/lancedb/test_pipeline.py new file mode 100644 index 0000000000..a89153f629 --- /dev/null +++ b/tests/load/lancedb/test_pipeline.py @@ -0,0 +1,435 @@ +from typing import Iterator, Generator, Any, List + +import pytest + +import dlt +from dlt.common import json +from dlt.common.typing import DictStrStr, DictStrAny +from dlt.common.utils import uniq_id +from dlt.destinations.impl.lancedb.lancedb_adapter import ( + lancedb_adapter, + VECTORIZE_HINT, +) +from dlt.destinations.impl.lancedb.lancedb_client import LanceDBClient +from tests.load.lancedb.utils import assert_table +from tests.load.utils import sequence_generator, drop_active_pipeline_data +from tests.pipeline.utils import assert_load_info + + +# Mark all tests as essential, do not remove. +pytestmark = pytest.mark.essential + + +@pytest.fixture(autouse=True) +def drop_lancedb_data() -> Iterator[None]: + yield + drop_active_pipeline_data() + + +def test_adapter_and_hints() -> None: + generator_instance1 = sequence_generator() + + @dlt.resource(columns=[{"name": "content", "data_type": "text"}]) + def some_data() -> Generator[DictStrStr, Any, None]: + yield from next(generator_instance1) + + assert some_data.columns["content"] == {"name": "content", "data_type": "text"} # type: ignore[index] + + lancedb_adapter( + some_data, + embed=["content"], + ) + + assert some_data.columns["content"] == { # type: ignore + "name": "content", + "data_type": "text", + "x-lancedb-embed": True, + } + + +def test_basic_state_and_schema() -> None: + generator_instance1 = sequence_generator() + + @dlt.resource + def some_data() -> Generator[DictStrStr, Any, None]: + yield from next(generator_instance1) + + lancedb_adapter( + some_data, + embed=["content"], + ) + + pipeline = dlt.pipeline( + pipeline_name="test_pipeline_append", + destination="lancedb", + dataset_name=f"test_pipeline_append_dataset{uniq_id()}", + ) + info = pipeline.run( + some_data(), + ) + assert_load_info(info) + + client: LanceDBClient + with pipeline.destination_client() as client: # type: ignore + # Check if we can get a stored schema and state. 
+ schema = client.get_stored_schema() + print("Print dataset name", client.dataset_name) + assert schema + state = client.get_stored_state("test_pipeline_append") + assert state + + +def test_pipeline_append() -> None: + generator_instance1 = sequence_generator() + generator_instance2 = sequence_generator() + + @dlt.resource + def some_data() -> Generator[DictStrStr, Any, None]: + yield from next(generator_instance1) + + lancedb_adapter( + some_data, + embed=["content"], + ) + + pipeline = dlt.pipeline( + pipeline_name="test_pipeline_append", + destination="lancedb", + dataset_name=f"TestPipelineAppendDataset{uniq_id()}", + ) + info = pipeline.run( + some_data(), + ) + assert_load_info(info) + + data = next(generator_instance2) + assert_table(pipeline, "some_data", items=data) + + info = pipeline.run( + some_data(), + ) + assert_load_info(info) + + data.extend(next(generator_instance2)) + assert_table(pipeline, "some_data", items=data) + + +def test_explicit_append() -> None: + """Append should work even when the primary key is specified.""" + data = [ + {"doc_id": 1, "content": "1"}, + {"doc_id": 2, "content": "2"}, + {"doc_id": 3, "content": "3"}, + ] + + @dlt.resource(primary_key="doc_id") + def some_data() -> Generator[List[DictStrAny], Any, None]: + yield data + + lancedb_adapter( + some_data, + embed=["content"], + ) + + pipeline = dlt.pipeline( + pipeline_name="test_pipeline_append", + destination="lancedb", + dataset_name=f"TestPipelineAppendDataset{uniq_id()}", + ) + info = pipeline.run( + some_data(), + ) + + assert_table(pipeline, "some_data", items=data) + + info = pipeline.run( + some_data(), + write_disposition="append", + ) + assert_load_info(info) + + data.extend(data) + assert_table(pipeline, "some_data", items=data) + + +def test_pipeline_replace() -> None: + generator_instance1 = sequence_generator() + generator_instance2 = sequence_generator() + + @dlt.resource + def some_data() -> Generator[DictStrStr, Any, None]: + yield from next(generator_instance1) + + lancedb_adapter( + some_data, + embed=["content"], + ) + + uid = uniq_id() + + pipeline = dlt.pipeline( + pipeline_name="test_pipeline_replace", + destination="lancedb", + dataset_name="test_pipeline_replace_dataset" + + uid, # lancedb doesn't mandate any name normalization + ) + + info = pipeline.run( + some_data(), + write_disposition="replace", + ) + assert_load_info(info) + assert info.dataset_name == f"test_pipeline_replace_dataset{uid}" + + data = next(generator_instance2) + assert_table(pipeline, "some_data", items=data) + + info = pipeline.run( + some_data(), + write_disposition="replace", + ) + assert_load_info(info) + + data = next(generator_instance2) + assert_table(pipeline, "some_data", items=data) + + +def test_pipeline_merge() -> None: + data = [ + { + "doc_id": 1, + "merge_id": "shawshank-redemption-1994", + "title": "The Shawshank Redemption", + "description": ( + "Two imprisoned men find redemption through acts of decency over the years." + ), + }, + { + "doc_id": 2, + "merge_id": "the-godfather-1972", + "title": "The Godfather", + "description": ( + "A crime dynasty's aging patriarch transfers control to his reluctant son." + ), + }, + { + "doc_id": 3, + "merge_id": "the-dark-knight-2008", + "title": "The Dark Knight", + "description": ( + "The Joker wreaks havoc on Gotham, challenging The Dark Knight's ability to fight" + " injustice." 
+ ), + }, + { + "doc_id": 4, + "merge_id": "pulp-fiction-1994", + "title": "Pulp Fiction", + "description": ( + "The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner" + " bandits intertwine in four tales of violence and redemption." + ), + }, + { + "doc_id": 5, + "merge_id": "schindlers-list-1993", + "title": "Schindler's List", + "description": ( + "In German-occupied Poland during World War II, industrialist Oskar Schindler" + " gradually becomes concerned for his Jewish workforce after witnessing their" + " persecution by the Nazis." + ), + }, + { + "doc_id": 6, + "merge_id": "the-lord-of-the-rings-the-return-of-the-king-2003", + "title": "The Lord of the Rings: The Return of the King", + "description": ( + "Gandalf and Aragorn lead the World of Men against Sauron's army to draw his gaze" + " from Frodo and Sam as they approach Mount Doom with the One Ring." + ), + }, + { + "doc_id": 7, + "merge_id": "the-matrix-1999", + "title": "The Matrix", + "description": ( + "A computer hacker learns from mysterious rebels about the true nature of his" + " reality and his role in the war against its controllers." + ), + }, + ] + + @dlt.resource(primary_key="doc_id") + def movies_data() -> Any: + yield data + + @dlt.resource(primary_key="doc_id", merge_key=["merge_id", "title"]) + def movies_data_explicit_merge_keys() -> Any: + yield data + + lancedb_adapter( + movies_data, + embed=["description"], + ) + + lancedb_adapter( + movies_data_explicit_merge_keys, + embed=["description"], + ) + + pipeline = dlt.pipeline( + pipeline_name="movies", + destination="lancedb", + dataset_name=f"TestPipelineAppendDataset{uniq_id()}", + ) + info = pipeline.run( + movies_data(), + write_disposition="merge", + dataset_name=f"MoviesDataset{uniq_id()}", + ) + assert_load_info(info) + assert_table(pipeline, "movies_data", items=data) + + # Change some data. + data[0]["title"] = "The Shawshank Redemption 2" + + info = pipeline.run( + movies_data(), + write_disposition="merge", + ) + assert_load_info(info) + assert_table(pipeline, "movies_data", items=data) + + info = pipeline.run( + movies_data(), + write_disposition="merge", + ) + assert_load_info(info) + assert_table(pipeline, "movies_data", items=data) + + # Test with explicit merge keys. 
+ info = pipeline.run( + movies_data_explicit_merge_keys(), + write_disposition="merge", + ) + assert_load_info(info) + assert_table(pipeline, "movies_data_explicit_merge_keys", items=data) + + +def test_pipeline_with_schema_evolution() -> None: + data = [ + { + "doc_id": 1, + "content": "1", + }, + { + "doc_id": 2, + "content": "2", + }, + ] + + @dlt.resource() + def some_data() -> Generator[List[DictStrAny], Any, None]: + yield data + + lancedb_adapter(some_data, embed=["content"]) + + pipeline = dlt.pipeline( + pipeline_name="test_pipeline_append", + destination="lancedb", + dataset_name=f"TestSchemaEvolutionDataset{uniq_id()}", + ) + pipeline.run( + some_data(), + ) + + assert_table(pipeline, "some_data", items=data) + + aggregated_data = data.copy() + + data = [ + { + "doc_id": 3, + "content": "3", + "new_column": "new", + }, + { + "doc_id": 4, + "content": "4", + "new_column": "new", + }, + ] + + pipeline.run( + some_data(), + ) + + table_schema = pipeline.default_schema.tables["some_data"] + assert "new_column" in table_schema["columns"] + + aggregated_data.extend(data) + + assert_table(pipeline, "some_data", items=aggregated_data) + + +def test_merge_github_nested() -> None: + pipe = dlt.pipeline(destination="lancedb", dataset_name="github1", full_refresh=True) + assert pipe.dataset_name.startswith("github1_202") + + with open( + "tests/normalize/cases/github.issues.load_page_5_duck.json", + "r", + encoding="utf-8", + ) as f: + data = json.load(f) + + info = pipe.run( + lancedb_adapter(data[:17], embed=["title", "body"]), + table_name="issues", + write_disposition="merge", + primary_key="id", + ) + assert_load_info(info) + # assert if schema contains tables with right names + print(pipe.default_schema.tables.keys()) + assert set(pipe.default_schema.tables.keys()) == { + "_dlt_version", + "_dlt_loads", + "issues", + "_dlt_pipeline_state", + "issues__labels", + "issues__assignees", + } + assert {t["name"] for t in pipe.default_schema.data_tables()} == { + "issues", + "issues__labels", + "issues__assignees", + } + assert {t["name"] for t in pipe.default_schema.dlt_tables()} == { + "_dlt_version", + "_dlt_loads", + "_dlt_pipeline_state", + } + issues = pipe.default_schema.tables["issues"] + assert issues["columns"]["id"]["primary_key"] is True + # Make sure vectorization is enabled for. + assert issues["columns"]["title"][VECTORIZE_HINT] # type: ignore[literal-required] + assert issues["columns"]["body"][VECTORIZE_HINT] # type: ignore[literal-required] + assert VECTORIZE_HINT not in issues["columns"]["url"] + assert_table(pipe, "issues", expected_items_count=17) + + +def test_empty_dataset_allowed() -> None: + # dataset_name is optional so dataset name won't be autogenerated when not explicitly passed. + pipe = dlt.pipeline(destination="lancedb", full_refresh=True) + client: LanceDBClient = pipe.destination_client() # type: ignore[assignment] + + assert pipe.dataset_name is None + info = pipe.run(lancedb_adapter(["context", "created", "not a stop word"], embed=["value"])) + # Dataset in load info is empty. 
+ assert info.dataset_name is None + client = pipe.destination_client() # type: ignore[assignment] + assert client.dataset_name is None + assert client.sentinel_table == "dltSentinelTable" + assert_table(pipe, "content", expected_items_count=3) diff --git a/tests/load/lancedb/utils.py b/tests/load/lancedb/utils.py new file mode 100644 index 0000000000..dc3ea5304b --- /dev/null +++ b/tests/load/lancedb/utils.py @@ -0,0 +1,74 @@ +from typing import Union, List, Any, Dict + +import numpy as np +from lancedb.embeddings import TextEmbeddingFunction # type: ignore + +import dlt +from dlt.destinations.impl.lancedb.lancedb_client import LanceDBClient + + +def assert_unordered_dicts_equal( + dict_list1: List[Dict[str, Any]], dict_list2: List[Dict[str, Any]] +) -> None: + """ + Assert that two lists of dictionaries contain the same dictionaries, ignoring None values. + + Args: + dict_list1 (List[Dict[str, Any]]): The first list of dictionaries to compare. + dict_list2 (List[Dict[str, Any]]): The second list of dictionaries to compare. + + Raises: + AssertionError: If the lists have different lengths or contain different dictionaries. + """ + assert len(dict_list1) == len(dict_list2), "Lists have different length" + + dict_set1 = {tuple(sorted((k, v) for k, v in d.items() if v is not None)) for d in dict_list1} + dict_set2 = {tuple(sorted((k, v) for k, v in d.items() if v is not None)) for d in dict_list2} + + assert dict_set1 == dict_set2, "Lists contain different dictionaries" + + +def assert_table( + pipeline: dlt.Pipeline, + table_name: str, + expected_items_count: int = None, + items: List[Any] = None, +) -> None: + client: LanceDBClient = pipeline.destination_client() # type: ignore[assignment] + qualified_table_name = client.make_qualified_table_name(table_name) + + exists = client.table_exists(qualified_table_name) + assert exists + + records = client.db_client.open_table(qualified_table_name).search().limit(50).to_list() + + if expected_items_count is not None: + assert expected_items_count == len(records) + + if items is None: + return + + drop_keys = [ + "_dlt_id", + "_dlt_load_id", + dlt.config.get("destination.lancedb.credentials.id_field_name", str) or "id__", + dlt.config.get("destination.lancedb.credentials.vector_field_name", str) or "vector__", + ] + objects_without_dlt_or_special_keys = [ + {k: v for k, v in record.items() if k not in drop_keys} for record in records + ] + + assert_unordered_dicts_equal(objects_without_dlt_or_special_keys, items) + + +class MockEmbeddingFunc(TextEmbeddingFunction): + def generate_embeddings( + self, + texts: Union[List[str], np.ndarray], # type: ignore[type-arg] + *args, + **kwargs, + ) -> List[np.ndarray]: # type: ignore[type-arg] + return [np.array(None)] + + def ndims(self) -> int: + return 2 diff --git a/tests/load/qdrant/test_pipeline.py b/tests/load/qdrant/test_pipeline.py index e0bd1fff97..e0cb9dab84 100644 --- a/tests/load/qdrant/test_pipeline.py +++ b/tests/load/qdrant/test_pipeline.py @@ -319,8 +319,8 @@ def test_merge_github_nested() -> None: primary_key="id", ) assert_load_info(info) + # assert if schema contains tables with right names - print(p.default_schema.tables.keys()) assert set(p.default_schema.tables.keys()) == { "_dlt_version", "_dlt_loads", diff --git a/tests/load/utils.py b/tests/load/utils.py index 8c6446b921..00ed4e3bf3 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -276,8 +276,10 @@ def destinations_configs( assert set(SQL_DESTINATIONS) == {d.destination for d in destination_configs} if 
default_vector_configs: - # for now only weaviate - destination_configs += [DestinationTestConfiguration(destination="weaviate")] + destination_configs += [ + DestinationTestConfiguration(destination="weaviate"), + DestinationTestConfiguration(destination="lancedb"), + ] if default_staging_configs or all_staging_configs: destination_configs += [ diff --git a/tests/utils.py b/tests/utils.py index 47b6561c8e..bf3aafdb77 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -45,13 +45,22 @@ "motherduck", "mssql", "qdrant", + "lancedb", "destination", "synapse", "databricks", "clickhouse", "dremio", } -NON_SQL_DESTINATIONS = {"filesystem", "weaviate", "dummy", "motherduck", "qdrant", "destination"} +NON_SQL_DESTINATIONS = { + "filesystem", + "weaviate", + "dummy", + "motherduck", + "qdrant", + "lancedb", + "destination", +} SQL_DESTINATIONS = IMPLEMENTED_DESTINATIONS - NON_SQL_DESTINATIONS # exclude destination configs (for now used for athena and athena iceberg separation) From af7752725b34ff9fa6f081f1c2483ac767944284 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Thu, 27 Jun 2024 18:55:01 +0200 Subject: [PATCH 42/61] Feat/simplifies naming convention writing (#1523) * adds naming convention example * improves naming convention docs * simplifies naming convention classes and configurations, implements sql cs, adds tests * bumps to version 0.5.1a0 * linter fixes * format fixes --- .github/workflows/test_doc_snippets.yml | 20 ++ Makefile | 4 +- .../configuration/specs/run_configuration.py | 4 +- dlt/common/destination/capabilities.py | 6 +- dlt/common/normalizers/configuration.py | 24 +- dlt/common/normalizers/naming/direct.py | 20 +- dlt/common/normalizers/naming/duck_case.py | 15 +- dlt/common/normalizers/naming/exceptions.py | 15 +- dlt/common/normalizers/naming/naming.py | 35 ++- dlt/common/normalizers/naming/snake_case.py | 53 +++-- dlt/common/normalizers/naming/sql_ci_v1.py | 10 +- dlt/common/normalizers/naming/sql_cs_v1.py | 42 +++- dlt/common/normalizers/typing.py | 7 +- dlt/common/normalizers/utils.py | 143 ++++++++---- dlt/common/schema/migrations.py | 3 +- dlt/common/schema/schema.py | 23 +- dlt/destinations/impl/weaviate/ci_naming.py | 9 +- dlt/destinations/impl/weaviate/naming.py | 14 +- docs/examples/conftest.py | 7 +- docs/examples/custom_naming/.dlt/config.toml | 2 + docs/examples/custom_naming/__init__.py | 0 docs/examples/custom_naming/custom_naming.py | 90 ++++++++ .../custom_naming/sql_ci_no_collision.py | 34 +++ docs/examples/custom_naming/sql_cs_latin2.py | 21 ++ .../postgres_to_postgres.py | 3 +- docs/tools/prepare_examples_tests.py | 12 +- .../docs/general-usage/naming-convention.md | 79 ++++--- poetry.lock | 216 +++++++++++++++++- pyproject.toml | 3 +- .../{ => cases}/normalizers/snake_no_x.py | 0 tests/common/cases/normalizers/sql_upper.py | 10 +- tests/common/cases/normalizers/title_case.py | 3 +- .../common/normalizers/custom_normalizers.py | 7 + .../normalizers/test_import_normalizers.py | 134 ++++++++++- tests/common/normalizers/test_naming.py | 56 ++++- .../normalizers/test_naming_snake_case.py | 29 +-- tests/common/normalizers/test_naming_sql.py | 55 +++++ .../schema/test_normalize_identifiers.py | 29 ++- 38 files changed, 972 insertions(+), 265 deletions(-) create mode 100644 docs/examples/custom_naming/.dlt/config.toml create mode 100644 docs/examples/custom_naming/__init__.py create mode 100644 docs/examples/custom_naming/custom_naming.py create mode 100644 docs/examples/custom_naming/sql_ci_no_collision.py create mode 100644 
docs/examples/custom_naming/sql_cs_latin2.py rename tests/common/{ => cases}/normalizers/snake_no_x.py (100%) create mode 100644 tests/common/normalizers/test_naming_sql.py diff --git a/.github/workflows/test_doc_snippets.yml b/.github/workflows/test_doc_snippets.yml index 2c51695714..6d4e6dda53 100644 --- a/.github/workflows/test_doc_snippets.yml +++ b/.github/workflows/test_doc_snippets.yml @@ -32,6 +32,26 @@ jobs: # Do not run on forks, unless allowed, secrets are used here if: ${{ !github.event.pull_request.head.repo.fork || contains(github.event.pull_request.labels.*.name, 'ci from fork')}} + # Service containers to run with `container-job` + services: + # Label used to access the service container + postgres: + # Docker Hub image + image: postgres + # Provide the password for postgres + env: + POSTGRES_DB: dlt_data + POSTGRES_USER: loader + POSTGRES_PASSWORD: loader + ports: + - 5432:5432 + # Set health checks to wait until postgres has started + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + steps: - name: Check out diff --git a/Makefile b/Makefile index fd0920d188..15fb895a9f 100644 --- a/Makefile +++ b/Makefile @@ -67,9 +67,9 @@ lint-and-test-snippets: cd docs/website/docs && poetry run pytest --ignore=node_modules lint-and-test-examples: - poetry run mypy --config-file mypy.ini docs/examples - poetry run flake8 --max-line-length=200 docs/examples cd docs/tools && poetry run python prepare_examples_tests.py + poetry run flake8 --max-line-length=200 docs/examples + poetry run mypy --config-file mypy.ini docs/examples cd docs/examples && poetry run pytest diff --git a/dlt/common/configuration/specs/run_configuration.py b/dlt/common/configuration/specs/run_configuration.py index 6833b7678d..dcb78683fb 100644 --- a/dlt/common/configuration/specs/run_configuration.py +++ b/dlt/common/configuration/specs/run_configuration.py @@ -17,7 +17,9 @@ class RunConfiguration(BaseConfiguration): dlthub_telemetry: bool = True # enable or disable dlthub telemetry dlthub_telemetry_endpoint: Optional[str] = "https://telemetry.scalevector.ai" dlthub_telemetry_segment_write_key: Optional[str] = None - log_format: str = "{asctime}|[{levelname}]|{process}|{thread}|{name}|{filename}|{funcName}:{lineno}|{message}" + log_format: str = ( + "{asctime}|[{levelname}]|{process}|{thread}|{name}|{filename}|{funcName}:{lineno}|{message}" + ) log_level: str = "WARNING" request_timeout: float = 60 """Timeout for http requests""" diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index f28065782a..595d3e0d26 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -8,9 +8,9 @@ Tuple, Set, Protocol, - Union, get_args, ) +from dlt.common.normalizers.typing import TNamingConventionReferenceArg from dlt.common.typing import TLoaderFileFormat from dlt.common.configuration.utils import serialize_value from dlt.common.configuration import configspec @@ -74,7 +74,7 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): supports_transactions: bool = None supports_ddl_transactions: bool = None # use naming convention in the schema - naming_convention: Union[str, NamingConvention] = None + naming_convention: TNamingConventionReferenceArg = None alter_add_multi_column: bool = True supports_truncate_command: bool = True schema_supports_numeric_precision: bool = True @@ -99,7 +99,7 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): @staticmethod def 
generic_capabilities( preferred_loader_file_format: TLoaderFileFormat = None, - naming_convention: Union[str, NamingConvention] = None, + naming_convention: TNamingConventionReferenceArg = None, loader_file_format_adapter: LoaderFileFormatAdapter = None, supported_table_formats: Sequence["TTableFormat"] = None, # type: ignore[name-defined] # noqa: F821 ) -> "DestinationCapabilitiesContext": diff --git a/dlt/common/normalizers/configuration.py b/dlt/common/normalizers/configuration.py index 4e9d9c4a20..6011ba4774 100644 --- a/dlt/common/normalizers/configuration.py +++ b/dlt/common/normalizers/configuration.py @@ -1,9 +1,8 @@ -from typing import ClassVar, Optional, Union +from typing import ClassVar, Optional from dlt.common.configuration import configspec from dlt.common.configuration.specs import BaseConfiguration, known_sections -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.normalizers.naming import NamingConvention +from dlt.common.normalizers.typing import TNamingConventionReferenceArg from dlt.common.typing import DictStrAny @@ -12,23 +11,6 @@ class NormalizersConfiguration(BaseConfiguration): # always in section __section__: ClassVar[str] = known_sections.SCHEMA - naming: Optional[Union[str, NamingConvention]] = None + naming: Optional[TNamingConventionReferenceArg] = None # Union[str, NamingConvention] json_normalizer: Optional[DictStrAny] = None allow_identifier_change_on_table_with_data: Optional[bool] = None - destination_capabilities: Optional[DestinationCapabilitiesContext] = None # injectable - - def on_resolved(self) -> None: - # get naming from capabilities if not present - if self.naming is None: - if self.destination_capabilities: - self.naming = self.destination_capabilities.naming_convention - # if max_table_nesting is set, we need to set the max_table_nesting in the json_normalizer - if ( - self.destination_capabilities - and self.destination_capabilities.max_table_nesting is not None - ): - self.json_normalizer = self.json_normalizer or {} - self.json_normalizer.setdefault("config", {}) - self.json_normalizer["config"][ - "max_nesting" - ] = self.destination_capabilities.max_table_nesting diff --git a/dlt/common/normalizers/naming/direct.py b/dlt/common/normalizers/naming/direct.py index c164e28365..fc146dbc4c 100644 --- a/dlt/common/normalizers/naming/direct.py +++ b/dlt/common/normalizers/naming/direct.py @@ -1,19 +1,23 @@ -from typing import Any, Sequence +from typing import ClassVar from dlt.common.normalizers.naming.naming import NamingConvention as BaseNamingConvention class NamingConvention(BaseNamingConvention): - PATH_SEPARATOR = "▶" - _CLEANUP_TABLE = str.maketrans(".\n\r'\"▶", "______") + """Case sensitive naming convention that maps source identifiers to destination identifiers with + only minimal changes. New line characters, double and single quotes are replaced with underscores. + + Uses ▶ as path separator. 
+ """ + + PATH_SEPARATOR: ClassVar[str] = "▶" + _CLEANUP_TABLE = str.maketrans("\n\r'\"▶", "_____") def normalize_identifier(self, identifier: str) -> str: identifier = super().normalize_identifier(identifier) norm_identifier = identifier.translate(self._CLEANUP_TABLE) return self.shorten_identifier(norm_identifier, identifier, self.max_length) - def make_path(self, *identifiers: Any) -> str: - return self.PATH_SEPARATOR.join(filter(lambda x: x.strip(), identifiers)) - - def break_path(self, path: str) -> Sequence[str]: - return [ident for ident in path.split(self.PATH_SEPARATOR) if ident.strip()] + @property + def is_case_sensitive(self) -> bool: + return True diff --git a/dlt/common/normalizers/naming/duck_case.py b/dlt/common/normalizers/naming/duck_case.py index 77c41e0e43..3801660ba8 100644 --- a/dlt/common/normalizers/naming/duck_case.py +++ b/dlt/common/normalizers/naming/duck_case.py @@ -5,14 +5,15 @@ class NamingConvention(SnakeCaseNamingConvention): + """Case sensitive naming convention preserving all unicode characters except new line(s). Uses __ for path + separation and will replace multiple underscores with a single one. + """ + _CLEANUP_TABLE = str.maketrans('\n\r"', "___") - def __init__(self, max_length: int = None) -> None: - """Case sensitive naming convention preserving all unicode characters except new line(s). Uses __ for path - separation and will replace multiple underscores with a single one. - """ - super().__init__(max_length) - self.is_case_sensitive = True + @property + def is_case_sensitive(self) -> bool: + return True @staticmethod @lru_cache(maxsize=None) @@ -23,5 +24,5 @@ def _normalize_identifier(identifier: str, max_length: int) -> str: # shorten identifier return NamingConvention.shorten_identifier( - NamingConvention._RE_UNDERSCORES.sub("_", normalized_ident), identifier, max_length + NamingConvention.RE_UNDERSCORES.sub("_", normalized_ident), identifier, max_length ) diff --git a/dlt/common/normalizers/naming/exceptions.py b/dlt/common/normalizers/naming/exceptions.py index d8448fa1e0..0b22ae2dd5 100644 --- a/dlt/common/normalizers/naming/exceptions.py +++ b/dlt/common/normalizers/naming/exceptions.py @@ -11,11 +11,22 @@ def __init__(self, naming_module: str) -> None: if "." 
in naming_module: msg = f"Naming module {naming_module} could not be found and imported" else: - msg = f"Naming module {naming_module} is not one of the standard dlt naming conventions" + msg = ( + f"Naming module {naming_module} is not one of the standard dlt naming conventions" + " and could not be locally imported" + ) super().__init__(msg) -class InvalidNamingModule(NormalizersException): +class NamingTypeNotFound(ImportError, NormalizersException): + def __init__(self, naming_module: str, naming_class: str) -> None: + self.naming_module = naming_module + self.naming_class = naming_class + msg = f"In naming module '{naming_module}' type '{naming_class}' does not exist" + super().__init__(msg) + + +class InvalidNamingType(NormalizersException): def __init__(self, naming_module: str, naming_class: str) -> None: self.naming_module = naming_module self.naming_class = naming_class diff --git a/dlt/common/normalizers/naming/naming.py b/dlt/common/normalizers/naming/naming.py index b806f11eec..5ae5847963 100644 --- a/dlt/common/normalizers/naming/naming.py +++ b/dlt/common/normalizers/naming/naming.py @@ -3,19 +3,27 @@ from functools import lru_cache import math import hashlib -from typing import Sequence +from typing import Sequence, ClassVar class NamingConvention(ABC): - _TR_TABLE = bytes.maketrans(b"/+", b"ab") - _DEFAULT_COLLISION_PROB = 0.001 + """Initializes naming convention to generate identifier with `max_length` if specified. Base naming convention + is case sensitive by default + """ + + _TR_TABLE: ClassVar[bytes] = bytes.maketrans(b"/+", b"ab") + _DEFAULT_COLLISION_PROB: ClassVar[float] = 0.001 + PATH_SEPARATOR: ClassVar[str] = "__" + """Subsequent nested fields will be separated with the string below, applies both to field and table names""" def __init__(self, max_length: int = None) -> None: - """Initializes naming convention to generate identifier with `max_length` if specified. Base naming convention - is case sensitive by default - """ self.max_length = max_length - self.is_case_sensitive = True + + @property + @abstractmethod + def is_case_sensitive(self) -> bool: + """Tells if given naming convention is producing case insensitive or case sensitive identifiers.""" + pass @abstractmethod def normalize_identifier(self, identifier: str) -> str: @@ -31,15 +39,13 @@ def normalize_table_identifier(self, identifier: str) -> str: """Normalizes and shortens identifier that will function as a dataset, table or schema name, defaults to `normalize_identifier`""" return self.normalize_identifier(identifier) - @abstractmethod def make_path(self, *identifiers: str) -> str: """Builds path out of identifiers. 
Identifiers are neither normalized nor shortened""" - pass + return self.PATH_SEPARATOR.join(filter(lambda x: x.strip(), identifiers)) - @abstractmethod def break_path(self, path: str) -> Sequence[str]: """Breaks path into sequence of identifiers""" - pass + return [ident for ident in path.split(self.PATH_SEPARATOR) if ident.strip()] def normalize_path(self, path: str) -> str: """Breaks path into identifiers, normalizes components, reconstitutes and shortens the path""" @@ -70,6 +76,13 @@ def name(cls) -> str: return cls.__module__.split(".")[-1] return cls.__module__ + def __str__(self) -> str: + name = self.name() + name += "_cs" if self.is_case_sensitive else "_ci" + if self.max_length: + name += f"_{self.max_length}" + return name + @staticmethod @lru_cache(maxsize=None) def shorten_identifier( diff --git a/dlt/common/normalizers/naming/snake_case.py b/dlt/common/normalizers/naming/snake_case.py index 7ff9259745..d38841a238 100644 --- a/dlt/common/normalizers/naming/snake_case.py +++ b/dlt/common/normalizers/naming/snake_case.py @@ -1,49 +1,54 @@ import re -from typing import Sequence from functools import lru_cache +from typing import ClassVar from dlt.common.normalizers.naming.naming import NamingConvention as BaseNamingConvention +from dlt.common.normalizers.naming.sql_cs_v1 import ( + RE_UNDERSCORES, + RE_LEADING_DIGITS, + RE_NON_ALPHANUMERIC, +) +from dlt.common.typing import REPattern class NamingConvention(BaseNamingConvention): - _RE_UNDERSCORES = re.compile("__+") - _RE_LEADING_DIGITS = re.compile(r"^\d+") - # _RE_ENDING_UNDERSCORES = re.compile(r"_+$") - _RE_NON_ALPHANUMERIC = re.compile(r"[^a-zA-Z\d_]+") + """Case insensitive naming convention, converting source identifiers into lower case snake case with reduced alphabet. + + - Spaces around identifier are trimmed + - Removes all ascii characters except ascii alphanumerics and underscores + - Prepends `_` if name starts with number. + - Multiples of `_` are converted into single `_`. + - Replaces all trailing `_` with `x` + - Replaces `+` and `*` with `x`, `-` with `_`, `@` with `a` and `|` with `l` + + Uses __ as patent-child separator for tables and flattened column names. + """ + + RE_UNDERSCORES: ClassVar[REPattern] = RE_UNDERSCORES + RE_LEADING_DIGITS: ClassVar[REPattern] = RE_LEADING_DIGITS + RE_NON_ALPHANUMERIC: ClassVar[REPattern] = RE_NON_ALPHANUMERIC + _SNAKE_CASE_BREAK_1 = re.compile("([^_])([A-Z][a-z]+)") _SNAKE_CASE_BREAK_2 = re.compile("([a-z0-9])([A-Z])") _REDUCE_ALPHABET = ("+-*@|", "x_xal") _TR_REDUCE_ALPHABET = str.maketrans(_REDUCE_ALPHABET[0], _REDUCE_ALPHABET[1]) - # subsequent nested fields will be separated with the string below, applies both to field and table names - PATH_SEPARATOR = "__" - - def __init__(self, max_length: int = None) -> None: - """Case insensitive naming convention, converting source identifiers into snake case. Uses __ as path separator. - Multiple underscores are contracted to one. 
- """ - super().__init__(max_length) - self.is_case_sensitive = False + @property + def is_case_sensitive(self) -> bool: + return False def normalize_identifier(self, identifier: str) -> str: identifier = super().normalize_identifier(identifier) # print(f"{identifier} -> {self.shorten_identifier(identifier, self.max_length)} ({self.max_length})") return self._normalize_identifier(identifier, self.max_length) - def make_path(self, *identifiers: str) -> str: - # only non empty identifiers participate - return self.PATH_SEPARATOR.join(filter(lambda x: x.strip(), identifiers)) - - def break_path(self, path: str) -> Sequence[str]: - return [ident for ident in path.split(self.PATH_SEPARATOR) if ident.strip()] - @staticmethod @lru_cache(maxsize=None) def _normalize_identifier(identifier: str, max_length: int) -> str: """Normalizes the identifier according to naming convention represented by this function""" # all characters that are not letters digits or a few special chars are replaced with underscore normalized_ident = identifier.translate(NamingConvention._TR_REDUCE_ALPHABET) - normalized_ident = NamingConvention._RE_NON_ALPHANUMERIC.sub("_", normalized_ident) + normalized_ident = NamingConvention.RE_NON_ALPHANUMERIC.sub("_", normalized_ident) # shorten identifier return NamingConvention.shorten_identifier( @@ -57,7 +62,7 @@ def _to_snake_case(cls, identifier: str) -> str: identifier = cls._SNAKE_CASE_BREAK_2.sub(r"\1_\2", identifier).lower() # leading digits will be prefixed (if regex is defined) - if cls._RE_LEADING_DIGITS and cls._RE_LEADING_DIGITS.match(identifier): + if cls.RE_LEADING_DIGITS and cls.RE_LEADING_DIGITS.match(identifier): identifier = "_" + identifier # replace trailing _ with x @@ -67,4 +72,4 @@ def _to_snake_case(cls, identifier: str) -> str: # identifier = cls._RE_ENDING_UNDERSCORES.sub("x", identifier) # replace consecutive underscores with single one to prevent name collisions with PATH_SEPARATOR - return cls._RE_UNDERSCORES.sub("_", stripped_ident) + return cls.RE_UNDERSCORES.sub("_", stripped_ident) diff --git a/dlt/common/normalizers/naming/sql_ci_v1.py b/dlt/common/normalizers/naming/sql_ci_v1.py index baabb7ecf7..4fff52ffd6 100644 --- a/dlt/common/normalizers/naming/sql_ci_v1.py +++ b/dlt/common/normalizers/naming/sql_ci_v1.py @@ -2,11 +2,11 @@ class NamingConvention(SqlCsNamingConvention): - def __init__(self, max_length: int = None) -> None: - """A variant of sql_cs which lower cases all identifiers.""" - - super().__init__(max_length) - self.is_case_sensitive = False + """A variant of sql_cs which lower cases all identifiers.""" def normalize_identifier(self, identifier: str) -> str: return super().normalize_identifier(identifier).lower() + + @property + def is_case_sensitive(self) -> bool: + return False diff --git a/dlt/common/normalizers/naming/sql_cs_v1.py b/dlt/common/normalizers/naming/sql_cs_v1.py index 93b93bbc89..788089fa7d 100644 --- a/dlt/common/normalizers/naming/sql_cs_v1.py +++ b/dlt/common/normalizers/naming/sql_cs_v1.py @@ -1,22 +1,44 @@ -from typing import Any, Sequence +import re +from typing import ClassVar +from dlt.common.typing import REPattern from dlt.common.normalizers.naming.naming import NamingConvention as BaseNamingConvention -# TODO: not yet finished + +RE_UNDERSCORES = re.compile("__+") +RE_LEADING_DIGITS = re.compile(r"^\d+") +RE_ENDING_UNDERSCORES = re.compile(r"_+$") +RE_NON_ALPHANUMERIC = re.compile(r"[^a-zA-Z\d_]+") class NamingConvention(BaseNamingConvention): - PATH_SEPARATOR = "__" + """Generates case sensitive SQL safe 
identifiers, preserving the source casing. + + - Spaces around identifier are trimmed + - Removes all ascii characters except ascii alphanumerics and underscores + - Prepends `_` if name starts with number. + - Removes all trailing underscores. + - Multiples of `_` are converted into single `_`. + """ - _CLEANUP_TABLE = str.maketrans(".\n\r'\"▶", "______") + RE_NON_ALPHANUMERIC: ClassVar[REPattern] = RE_NON_ALPHANUMERIC + RE_UNDERSCORES: ClassVar[REPattern] = RE_UNDERSCORES + RE_ENDING_UNDERSCORES: ClassVar[REPattern] = RE_ENDING_UNDERSCORES def normalize_identifier(self, identifier: str) -> str: identifier = super().normalize_identifier(identifier) - norm_identifier = identifier.translate(self._CLEANUP_TABLE) + # remove non alpha characters + norm_identifier = self.RE_NON_ALPHANUMERIC.sub("_", identifier) + # remove leading digits + if RE_LEADING_DIGITS.match(norm_identifier): + norm_identifier = "_" + norm_identifier + # remove trailing underscores to not mess with how we break paths + if norm_identifier != "_": + norm_identifier = self.RE_ENDING_UNDERSCORES.sub("", norm_identifier) + # contract multiple __ + norm_identifier = self.RE_UNDERSCORES.sub("_", norm_identifier) return self.shorten_identifier(norm_identifier, identifier, self.max_length) - def make_path(self, *identifiers: Any) -> str: - return self.PATH_SEPARATOR.join(filter(lambda x: x.strip(), identifiers)) - - def break_path(self, path: str) -> Sequence[str]: - return [ident for ident in path.split(self.PATH_SEPARATOR) if ident.strip()] + @property + def is_case_sensitive(self) -> bool: + return True diff --git a/dlt/common/normalizers/typing.py b/dlt/common/normalizers/typing.py index 3903858091..9ea6f3cf11 100644 --- a/dlt/common/normalizers/typing.py +++ b/dlt/common/normalizers/typing.py @@ -1,8 +1,11 @@ -from typing import List, Optional, TypedDict, Union +from types import ModuleType +from typing import List, Optional, Type, TypedDict, Union from dlt.common.typing import StrAny from dlt.common.normalizers.naming import NamingConvention +TNamingConventionReferenceArg = Union[str, Type[NamingConvention], ModuleType] + class TJSONNormalizer(TypedDict, total=False): module: str @@ -10,7 +13,7 @@ class TJSONNormalizer(TypedDict, total=False): class TNormalizersConfig(TypedDict, total=False): - names: Union[str, NamingConvention] + names: str allow_identifier_change_on_table_with_data: Optional[bool] detections: Optional[List[str]] json: TJSONNormalizer diff --git a/dlt/common/normalizers/utils.py b/dlt/common/normalizers/utils.py index 49751980ff..beacf03e4e 100644 --- a/dlt/common/normalizers/utils.py +++ b/dlt/common/normalizers/utils.py @@ -1,20 +1,32 @@ -import inspect from importlib import import_module -from typing import Any, Dict, Optional, Type, Tuple, Union, cast, List +from types import ModuleType +from typing import Any, Dict, Optional, Type, Tuple, cast, List import dlt +from dlt.common import logger from dlt.common.configuration.inject import with_config from dlt.common.configuration.specs import known_sections from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.normalizers.configuration import NormalizersConfiguration +from dlt.common.normalizers.exceptions import InvalidJsonNormalizer from dlt.common.normalizers.json import SupportsDataItemNormalizer, DataItemNormalizer from dlt.common.normalizers.naming import NamingConvention -from dlt.common.normalizers.naming.exceptions import UnknownNamingModule, InvalidNamingModule -from dlt.common.normalizers.typing import 
TJSONNormalizer, TNormalizersConfig -from dlt.common.utils import uniq_id_base64, many_uniq_ids_base64 - -DEFAULT_NAMING_MODULE = "dlt.common.normalizers.naming.snake_case" +from dlt.common.normalizers.naming.exceptions import ( + NamingTypeNotFound, + UnknownNamingModule, + InvalidNamingType, +) +from dlt.common.normalizers.typing import ( + TJSONNormalizer, + TNormalizersConfig, + TNamingConventionReferenceArg, +) +from dlt.common.typing import is_subclass +from dlt.common.utils import get_full_class_name, uniq_id_base64, many_uniq_ids_base64 + +DEFAULT_NAMING_NAMESPACE = "dlt.common.normalizers.naming" DLT_ID_LENGTH_BYTES = 10 +DEFAULT_NAMING_MODULE = "snake_case" def _section_for_schema(kwargs: Dict[str, Any]) -> Tuple[str, ...]: @@ -27,16 +39,18 @@ def _section_for_schema(kwargs: Dict[str, Any]) -> Tuple[str, ...]: @with_config(spec=NormalizersConfiguration, sections=_section_for_schema) # type: ignore[call-overload] def explicit_normalizers( - naming: Union[str, NamingConvention] = dlt.config.value, + naming: TNamingConventionReferenceArg = dlt.config.value, json_normalizer: TJSONNormalizer = dlt.config.value, allow_identifier_change_on_table_with_data: bool = None, schema_name: Optional[str] = None, ) -> TNormalizersConfig: - """Gets explicitly configured normalizers - via config or destination caps. May return None as naming or normalizer + """Gets explicitly configured normalizers without any defaults or capabilities injection. If `naming` + is a module or a type it will get converted into string form via import. If `schema_name` is present, a section ("sources", schema_name, "schema") is used to inject the config """ - norm_conf: TNormalizersConfig = {"names": naming, "json": json_normalizer} + + norm_conf: TNormalizersConfig = {"names": serialize_reference(naming), "json": json_normalizer} if allow_identifier_change_on_table_with_data is not None: norm_conf["allow_identifier_change_on_table_with_data"] = ( allow_identifier_change_on_table_with_data @@ -46,78 +60,125 @@ def explicit_normalizers( @with_config def import_normalizers( - normalizers_config: TNormalizersConfig, + explicit_normalizers: TNormalizersConfig, + default_normalizers: TNormalizersConfig = None, destination_capabilities: DestinationCapabilitiesContext = None, ) -> Tuple[TNormalizersConfig, NamingConvention, Type[DataItemNormalizer[Any]]]: """Imports the normalizers specified in `normalizers_config` or taken from defaults. Returns the updated config and imported modules. - `destination_capabilities` are used to get max length of the identifier. + `destination_capabilities` are used to get naming convention, max length of the identifier and max nesting level. 
""" + if default_normalizers is None: + default_normalizers = {} # add defaults to normalizer_config - normalizers_config["names"] = names = normalizers_config["names"] or "snake_case" - normalizers_config["json"] = item_normalizer = normalizers_config.get("json") or {} + naming: TNamingConventionReferenceArg = explicit_normalizers.get("names") + if naming is None: + if destination_capabilities: + naming = destination_capabilities.naming_convention + if naming is None: + naming = default_normalizers.get("names") or DEFAULT_NAMING_MODULE + naming_convention = naming_from_reference(naming, destination_capabilities) + explicit_normalizers["names"] = serialize_reference(naming) + + item_normalizer = explicit_normalizers.get("json") or default_normalizers.get("json") or {} item_normalizer.setdefault("module", "dlt.common.normalizers.json.relational") - json_module = cast(SupportsDataItemNormalizer, import_module(item_normalizer["module"])) + # if max_table_nesting is set, we need to set the max_table_nesting in the json_normalizer + if destination_capabilities and destination_capabilities.max_table_nesting is not None: + # TODO: this is a hack, we need a better method to do this + from dlt.common.normalizers.json.relational import DataItemNormalizer + try: + DataItemNormalizer.ensure_this_normalizer(item_normalizer) + item_normalizer.setdefault("config", {}) + item_normalizer["config"]["max_nesting"] = destination_capabilities.max_table_nesting # type: ignore[index] + except InvalidJsonNormalizer: + # not a right normalizer + logger.warning(f"JSON Normalizer {item_normalizer} does not support max_nesting") + pass + json_module = cast(SupportsDataItemNormalizer, import_module(item_normalizer["module"])) + explicit_normalizers["json"] = item_normalizer return ( - normalizers_config, - naming_from_reference(names, destination_capabilities), + explicit_normalizers, + naming_convention, json_module.DataItemNormalizer, ) def naming_from_reference( - names: Union[str, NamingConvention], + names: TNamingConventionReferenceArg, destination_capabilities: DestinationCapabilitiesContext = None, ) -> NamingConvention: """Resolves naming convention from reference in `names` and applies max length from `destination_capabilities` - Reference may be: (1) actual instance of NamingConvention (2) shorthand name pointing to `dlt.common.normalizers.naming` namespace - (3) a type name which is a module containing `NamingConvention` attribute (4) a type of class deriving from NamingConvention + Reference may be: (1) shorthand name pointing to `dlt.common.normalizers.naming` namespace + (2) a type name which is a module containing `NamingConvention` attribute (3) a type of class deriving from NamingConvention """ - def _import_naming(module: str, cls: str) -> Type[NamingConvention]: - if "." in module or cls != "NamingConvention": + def _import_naming(module: str) -> ModuleType: + if "." in module: # TODO: bump schema engine version and migrate schema. 
also change the name in TNormalizersConfig from names to naming if module == "dlt.common.normalizers.names.snake_case": - module = DEFAULT_NAMING_MODULE + module = f"{DEFAULT_NAMING_NAMESPACE}.{DEFAULT_NAMING_MODULE}" # this is full module name naming_module = import_module(module) else: # from known location - naming_module = import_module(f"dlt.common.normalizers.naming.{module}") - class_ = getattr(naming_module, cls, None) + try: + naming_module = import_module(f"{DEFAULT_NAMING_NAMESPACE}.{module}") + except ImportError: + # also import local module + naming_module = import_module(module) + return naming_module + + def _get_type(naming_module: ModuleType, cls: str) -> Type[NamingConvention]: + class_: Type[NamingConvention] = getattr(naming_module, cls, None) if class_ is None: - raise UnknownNamingModule(module + "." + cls) - if inspect.isclass(class_) and issubclass(class_, NamingConvention): + raise NamingTypeNotFound(naming_module.__name__, cls) + if is_subclass(class_, NamingConvention): return class_ - raise InvalidNamingModule(module, cls) + raise InvalidNamingType(naming_module.__name__, cls) - if not isinstance(names, NamingConvention): + if is_subclass(names, NamingConvention): + class_: Type[NamingConvention] = names # type: ignore[assignment] + elif isinstance(names, ModuleType): + class_ = _get_type(names, "NamingConvention") + elif isinstance(names, str): try: - class_ = _import_naming(names, "NamingConvention") + class_ = _get_type(_import_naming(names), "NamingConvention") except ImportError: parts = names.rsplit(".", 1) # we have no more options to try if len(parts) <= 1: raise UnknownNamingModule(names) try: - class_ = _import_naming(*parts) + class_ = _get_type(_import_naming(parts[0]), parts[1]) except UnknownNamingModule: raise except ImportError: raise UnknownNamingModule(names) + else: + raise ValueError(names) - # get max identifier length - if destination_capabilities: - max_length = min( - destination_capabilities.max_identifier_length, - destination_capabilities.max_column_identifier_length, - ) - else: - max_length = None - names = class_(max_length) - return names + # get max identifier length + if destination_capabilities: + max_length = min( + destination_capabilities.max_identifier_length, + destination_capabilities.max_column_identifier_length, + ) + else: + max_length = None + + return class_(max_length) + + +def serialize_reference(naming: Optional[TNamingConventionReferenceArg]) -> Optional[str]: + """Serializes generic `naming` reference to importable string.""" + if naming is None: + return naming + if isinstance(naming, str): + return naming + # import reference and use naming to get valid path to type + return get_full_class_name(naming_from_reference(naming)) def generate_dlt_ids(n_ids: int) -> List[str]: diff --git a/dlt/common/schema/migrations.py b/dlt/common/schema/migrations.py index 1ef602a3f8..b64714ba19 100644 --- a/dlt/common/schema/migrations.py +++ b/dlt/common/schema/migrations.py @@ -29,7 +29,8 @@ def migrate_schema(schema_dict: DictStrAny, from_engine: int, to_engine: int) -> # current version of the schema current = cast(TStoredSchema, schema_dict) # add default normalizers and root hash propagation - current["normalizers"], _, _ = import_normalizers(explicit_normalizers()) + normalizers = explicit_normalizers() + current["normalizers"], _, _ = import_normalizers(normalizers, normalizers) current["normalizers"]["json"]["config"] = { "propagation": {"root": {"_dlt_id": "_dlt_root_id"}} } diff --git a/dlt/common/schema/schema.py 
b/dlt/common/schema/schema.py index fd0521cc14..52f8545587 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -13,7 +13,7 @@ ) from dlt.common.schema.migrations import migrate_schema -from dlt.common.utils import extend_list_deduplicated, get_full_class_name +from dlt.common.utils import extend_list_deduplicated from dlt.common.typing import ( DictStrAny, StrAny, @@ -648,13 +648,6 @@ def to_dict( remove_processing_hints: bool = False, bump_version: bool = True, ) -> TStoredSchema: - # prepare normalizers - if isinstance(self._normalizers_config["names"], NamingConvention): - normalizers_config = deepcopy(self._normalizers_config) - normalizers_config["names"] = get_full_class_name(normalizers_config["names"]) - else: - normalizers_config = self._normalizers_config - stored_schema: TStoredSchema = { "version": self._stored_version, "version_hash": self._stored_version_hash, @@ -662,7 +655,7 @@ def to_dict( "name": self._schema_name, "tables": self._schema_tables, "settings": self._settings, - "normalizers": normalizers_config, + "normalizers": self._normalizers_config, "previous_hashes": self._stored_previous_hashes, } if self._imported_version_hash and not remove_defaults: @@ -732,11 +725,7 @@ def update_normalizers(self) -> None: Default hints, preferred data types and normalize configs (ie. column propagation) are normalized as well. Regexes are included as long as textual parts can be extracted from an expression. """ - normalizers = explicit_normalizers(schema_name=self._schema_name) - # set the current values as defaults - normalizers["names"] = normalizers["names"] or self._normalizers_config["names"] - normalizers["json"] = normalizers["json"] or self._normalizers_config["json"] - self._configure_normalizers(normalizers) + self._configure_normalizers(explicit_normalizers(schema_name=self._schema_name)) self._compile_settings() def set_schema_contract(self, settings: TSchemaContract) -> None: @@ -1051,10 +1040,12 @@ def _renormalize_schema_identifiers( if preferred_types := self.settings.get("preferred_types"): self._settings["preferred_types"] = self._normalize_preferred_types(preferred_types) - def _configure_normalizers(self, normalizers: TNormalizersConfig) -> None: + def _configure_normalizers(self, explicit_normalizers: TNormalizersConfig) -> None: """Gets naming and item normalizer from schema yaml, config providers and destination capabilities and applies them to schema.""" # import desired modules - normalizers_config, to_naming, item_normalizer_class = import_normalizers(normalizers) + normalizers_config, to_naming, item_normalizer_class = import_normalizers( + explicit_normalizers, self._normalizers_config + ) self._renormalize_schema_identifiers(normalizers_config, to_naming, self.naming) # data item normalization function self.data_item_normalizer = item_normalizer_class(self) diff --git a/dlt/destinations/impl/weaviate/ci_naming.py b/dlt/destinations/impl/weaviate/ci_naming.py index 63c94776ad..6e1b0c129e 100644 --- a/dlt/destinations/impl/weaviate/ci_naming.py +++ b/dlt/destinations/impl/weaviate/ci_naming.py @@ -2,10 +2,11 @@ class NamingConvention(WeaviateNamingConvention): - def __init__(self, max_length: int = None) -> None: - """Case insensitive naming convention for Weaviate. Lower cases all identifiers""" - super().__init__(max_length) - self.is_case_sensitive = False + """Case insensitive naming convention for Weaviate. 
Lower cases all identifiers""" + + @property + def is_case_sensitive(self) -> bool: + return False def _lowercase_property(self, identifier: str) -> str: """Lowercase the whole property to become case insensitive""" diff --git a/dlt/destinations/impl/weaviate/naming.py b/dlt/destinations/impl/weaviate/naming.py index 1e8e73a8e1..81a53dafd3 100644 --- a/dlt/destinations/impl/weaviate/naming.py +++ b/dlt/destinations/impl/weaviate/naming.py @@ -1,18 +1,20 @@ import re +from typing import ClassVar from dlt.common.normalizers.naming import NamingConvention as BaseNamingConvention from dlt.common.normalizers.naming.snake_case import NamingConvention as SnakeCaseNamingConvention +from dlt.common.typing import REPattern class NamingConvention(SnakeCaseNamingConvention): """Normalizes identifiers according to Weaviate documentation: https://weaviate.io/developers/weaviate/config-refs/schema#class""" - def __init__(self, max_length: int = None) -> None: - super().__init__(max_length) - self.is_case_sensitive: bool = True + @property + def is_case_sensitive(self) -> bool: + return True RESERVED_PROPERTIES = {"id": "__id", "_id": "___id", "_additional": "__additional"} - _RE_UNDERSCORES = re.compile("([^_])__+") + RE_UNDERSCORES: ClassVar[REPattern] = re.compile("([^_])__+") _STARTS_DIGIT = re.compile("^[0-9]") _STARTS_NON_LETTER = re.compile("^[0-9_]") _SPLIT_UNDERSCORE_NON_CAP = re.compile("(_[^A-Z])") @@ -55,11 +57,11 @@ def _lowercase_property(self, identifier: str) -> str: def _base_normalize(self, identifier: str) -> str: # all characters that are not letters digits or a few special chars are replaced with underscore normalized_ident = identifier.translate(self._TR_REDUCE_ALPHABET) - normalized_ident = self._RE_NON_ALPHANUMERIC.sub("_", normalized_ident) + normalized_ident = self.RE_NON_ALPHANUMERIC.sub("_", normalized_ident) # replace trailing _ with x stripped_ident = normalized_ident.rstrip("_") strip_count = len(normalized_ident) - len(stripped_ident) stripped_ident += "x" * strip_count # replace consecutive underscores with single one to prevent name clashes with PATH_SEPARATOR - return self._RE_UNDERSCORES.sub(r"\1_", stripped_ident) + return self.RE_UNDERSCORES.sub(r"\1_", stripped_ident) diff --git a/docs/examples/conftest.py b/docs/examples/conftest.py index 87ccffe53b..be1a03990b 100644 --- a/docs/examples/conftest.py +++ b/docs/examples/conftest.py @@ -1,3 +1,4 @@ +import sys import os import pytest from unittest.mock import patch @@ -47,7 +48,11 @@ def _initial_providers(): ): # extras work when container updated glob_ctx.add_extras() - yield + try: + sys.path.insert(0, dname) + yield + finally: + sys.path.pop(0) def pytest_configure(config): diff --git a/docs/examples/custom_naming/.dlt/config.toml b/docs/examples/custom_naming/.dlt/config.toml new file mode 100644 index 0000000000..ba5c8ab73a --- /dev/null +++ b/docs/examples/custom_naming/.dlt/config.toml @@ -0,0 +1,2 @@ +[sources.sql_ci_no_collision.schema] +naming="sql_ci_no_collision" \ No newline at end of file diff --git a/docs/examples/custom_naming/__init__.py b/docs/examples/custom_naming/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/examples/custom_naming/custom_naming.py b/docs/examples/custom_naming/custom_naming.py new file mode 100644 index 0000000000..e99e582213 --- /dev/null +++ b/docs/examples/custom_naming/custom_naming.py @@ -0,0 +1,90 @@ +""" +--- +title: Create and use own naming convention +description: We demonstrate how to create naming conventions that allow UNICODE 
letters and never generate collisions +keywords: [example] +--- + +This example shows how to add and use custom naming convention. Naming conventions translate identifiers found in source data into identifiers in +destination, where rules for a valid identifier are constrained. + +Custom naming conventions are classes that derive from `NamingConvention` that you can import from `dlt.common.normalizers.naming`. We recommend the following module layout: +1. Each naming convention resides in a separate Python module (file) +2. The class is always named `NamingConvention` + +There are two naming conventions in this example: +1. A variant of `sql_ci` that generates identifier collisions with a low (user defined) probability by appending a deterministic tag to each name. +2. A variant of `sql_cs` that allows for LATIN (ie. umlaut) characters + +With this example you will learn to: +* Create a naming convention module with a recommended layout +* Use naming convention by explicitly passing it to `duckdb` destination factory +* Use naming convention by configuring it config.toml +* Changing the declared case sensitivity by overriding `is_case_sensitive` property +* Providing custom normalization logic by overriding `normalize_identifier` method + +""" + +import dlt + +if __name__ == "__main__": + # sql_cs_latin2 module + import sql_cs_latin2 # type: ignore[import-not-found] + + # create postgres destination with a custom naming convention. pass sql_cs_latin2 as module + # NOTE: ql_cs_latin2 is case sensitive and postgres accepts UNICODE letters in identifiers + dest_ = dlt.destinations.postgres( + "postgresql://loader:loader@localhost:5432/dlt_data", naming_convention=sql_cs_latin2 + ) + # run a pipeline + pipeline = dlt.pipeline( + pipeline_name="sql_cs_latin2_pipeline", + destination=dest_, + dataset_name="example_data", + dev_mode=True, + ) + # Extract, normalize, and load the data + load_info = pipeline.run([{"StückId": 1}], table_name="Ausrüstung") + print(load_info) + # make sure nothing failed + load_info.raise_on_failed_jobs() + with pipeline.sql_client() as client: + # NOTE: we quote case sensitive identifers + with client.execute_query('SELECT "StückId" FROM "Ausrüstung"') as cur: + print(cur.description) + print(cur.fetchone()) + + # sql_ci_no_collision (configured in config toml) + # NOTE: pipeline with name `sql_ci_no_collision` will create default schema with the same name + # so we are free to use it in config.toml to just affect this pipeline and leave the postgres pipeline as it is + pipeline = dlt.pipeline( + pipeline_name="sql_ci_no_collision", + destination="duckdb", + dataset_name="example_data", + dev_mode=True, + ) + # duckdb is case insensitive so tables and columns below would clash but sql_ci_no_collision prevents that + data_1 = {"ItemID": 1, "itemid": "collides"} + load_info = pipeline.run([data_1], table_name="BigData") + load_info.raise_on_failed_jobs() + + data_2 = {"1Data": 1, "_1data": "collides"} + # use colliding table + load_info = pipeline.run([data_2], table_name="bigdata") + load_info.raise_on_failed_jobs() + + with pipeline.sql_client() as client: + from duckdb import DuckDBPyConnection + + conn: DuckDBPyConnection = client.native_connection + # tags are deterministic so we can just use the naming convention to get table names to select + first_table = pipeline.default_schema.naming.normalize_table_identifier("BigData") + sql = f"DESCRIBE TABLE {first_table}" + print(sql) + print(conn.sql(sql)) + second_table = 
pipeline.default_schema.naming.normalize_table_identifier("bigdata") + sql = f"DESCRIBE TABLE {second_table}" + print(sql) + print(conn.sql(sql)) + + # print(pipeline.default_schema.to_pretty_yaml()) diff --git a/docs/examples/custom_naming/sql_ci_no_collision.py b/docs/examples/custom_naming/sql_ci_no_collision.py new file mode 100644 index 0000000000..276107ea2b --- /dev/null +++ b/docs/examples/custom_naming/sql_ci_no_collision.py @@ -0,0 +1,34 @@ +from typing import ClassVar + +from dlt.common.normalizers.naming.sql_cs_v1 import NamingConvention as SqlNamingConvention +from dlt.common.schema.typing import DLT_NAME_PREFIX + + +class NamingConvention(SqlNamingConvention): + """Case insensitive naming convention with all identifiers lowercases but with unique short tag added""" + + # we will reuse the code we use for shortening + # 1 in 100 prob of collision for identifiers identical after normalization + _DEFAULT_COLLISION_PROB: ClassVar[float] = 0.01 + + def normalize_identifier(self, identifier: str) -> str: + # compute unique tag on original (not normalized) identifier + # NOTE: you may wrap method below in lru_cache if you often normalize the same names + tag = self._compute_tag(identifier, self._DEFAULT_COLLISION_PROB) + # lower case + norm_identifier = identifier.lower() + # add tag if (not a dlt identifier) and tag was not added before (simple heuristics) + if "_4" in norm_identifier: + _, existing_tag = norm_identifier.rsplit("_4", 1) + has_tag = len(existing_tag) == len(tag) + else: + has_tag = False + if not norm_identifier.startswith(DLT_NAME_PREFIX) and not has_tag: + norm_identifier = norm_identifier + "_4" + tag + # run identifier through standard sql cleaning and shortening + return super().normalize_identifier(norm_identifier) + + @property + def is_case_sensitive(self) -> bool: + # switch the naming convention to case insensitive + return False diff --git a/docs/examples/custom_naming/sql_cs_latin2.py b/docs/examples/custom_naming/sql_cs_latin2.py new file mode 100644 index 0000000000..7cf31cc76a --- /dev/null +++ b/docs/examples/custom_naming/sql_cs_latin2.py @@ -0,0 +1,21 @@ +from typing import ClassVar + +# NOTE: we use regex library that supports unicode +import regex as re + +from dlt.common.normalizers.naming.sql_cs_v1 import NamingConvention as SqlNamingConvention +from dlt.common.typing import REPattern + + +class NamingConvention(SqlNamingConvention): + """Case sensitive naming convention which allows basic unicode characters, including latin 2 characters""" + + RE_NON_ALPHANUMERIC: ClassVar[REPattern] = re.compile(r"[^\p{Latin}\d_]+") # type: ignore + + def normalize_identifier(self, identifier: str) -> str: + # typically you'd change how a single + return super().normalize_identifier(identifier) + + @property + def is_case_sensitive(self) -> bool: + return True diff --git a/docs/examples/postgres_to_postgres/postgres_to_postgres.py b/docs/examples/postgres_to_postgres/postgres_to_postgres.py index 848af53317..c6502f236a 100644 --- a/docs/examples/postgres_to_postgres/postgres_to_postgres.py +++ b/docs/examples/postgres_to_postgres/postgres_to_postgres.py @@ -91,7 +91,7 @@ def pg_resource_chunked( order_date: str, load_type: str = "merge", columns: str = "*", - credentials: ConnectionStringCredentials = dlt.secrets["sources.postgres.credentials"], + credentials: ConnectionStringCredentials = None, ): print( f"dlt.resource write_disposition: `{load_type}` -- ", @@ -162,6 +162,7 @@ def table_desc(table_name, pk, schema_name, order_date, columns="*"): 
table["order_date"], load_type=load_type, columns=table["columns"], + credentials=dlt.secrets["sources.postgres.credentials"], ) ) diff --git a/docs/tools/prepare_examples_tests.py b/docs/tools/prepare_examples_tests.py index 58e56cc15b..d39d311a50 100644 --- a/docs/tools/prepare_examples_tests.py +++ b/docs/tools/prepare_examples_tests.py @@ -54,8 +54,12 @@ os.unlink(test_example_file) continue - with open(example_file, "r", encoding="utf-8") as f: - lines = f.read().split("\n") + try: + with open(example_file, "r", encoding="utf-8") as f: + lines = f.read().split("\n") + except FileNotFoundError: + print(f"Example file {example_file} not found, test prep will be skipped") + continue processed_lines = TEST_HEADER.split("\n") main_clause_found = False @@ -64,8 +68,8 @@ # convert the main clause to a test function if line.startswith(MAIN_CLAUSE): main_clause_found = True - processed_lines.append("@skipifgithubfork") # skip on forks - processed_lines.append("@pytest.mark.forked") # skip on forks + processed_lines.append("@skipifgithubfork") # skip on forks + processed_lines.append("@pytest.mark.forked") # skip on forks processed_lines.append(f"def test_{example}():") else: processed_lines.append(line) diff --git a/docs/website/docs/general-usage/naming-convention.md b/docs/website/docs/general-usage/naming-convention.md index c24b6c4869..72db7bf5f3 100644 --- a/docs/website/docs/general-usage/naming-convention.md +++ b/docs/website/docs/general-usage/naming-convention.md @@ -21,24 +21,16 @@ You can pick which naming convention to use. `dlt` provides a few to [choose fro ::: ### Use default naming convention (snake_case) -`dlt` most used and tested with default, case insensitive, lower case naming convention called **snake_case** +Case insensitive naming convention, converting source identifiers into lower case snake case with reduced alphabet. -1. Converts identifiers to **snake_case**, small caps. Removes all ascii characters except ascii - alphanumerics and underscores. -1. Adds `_` if name starts with number. -1. Multiples of `_` are converted into single `_`. -1. The parent-child relation is expressed as double `_` in names. -1. It shorts the identifier if it exceed the length at the destination. +- Spaces around identifier are trimmed +- Keeps ascii alphanumerics and underscores, replaces all other characters with underscores (with the exceptions below) +- Replaces `+` and `*` with `x`, `-` with `_`, `@` with `a` and `|` with `l` +- Prepends `_` if name starts with number. +- Multiples of `_` are converted into single `_`. +- Replaces all trailing `_` with `x` -> 💡 Standard behavior of `dlt` is to **use the same naming convention for all destinations** so -> users see always the same tables and columns in their databases. - -> 💡 If you provide any schema elements that contain identifiers via decorators or arguments (i.e. -> `table_name` or `columns`) all the names used will be converted via the naming convention when -> adding to the schema. For example if you execute `dlt.run(... table_name="CamelCase")` the data -> will be loaded into `camel_case`. - -> 💡 Use simple, short small caps identifiers for everything! +Uses __ as patent-child separator for tables and flattened column names. 
:::tip
If you do not like **snake_case** your next safe option is **sql_ci** which generates SQL-safe, lower-case, case-insensitive identifiers without any
@@ -55,7 +47,7 @@ naming="sql_ci_v1"
### Pick the right identifier form when defining resources
`dlt` keeps source (not normalized) identifiers during data [extraction](../reference/explainers/how-dlt-works.md#extract) and translates them during [normalization](../reference/explainers/how-dlt-works.md#normalize). For you it means:
1. If you write a [transformer](resource.md#process-resources-with-dlttransformer) or a [mapping/filtering function](resource.md#filter-transform-and-pivot-data), you will see the original data, without any normalization. Use the source key names to access the dicts!
-2. If you define a `primary_key` or `cursor` that participate in [incremental loading](incremental-loading.md#incremental-loading-with-a-cursor-field) use the source identifiers (as `dlt` will inspect the source data).
+2. If you define a `primary_key` or `cursor` that participates in [cursor field incremental loading](incremental-loading.md#incremental-loading-with-a-cursor-field), use the source identifiers (`dlt` uses them to inspect the source data; the `Incremental` class is a filtering function).
3. When defining any other hints ie. `columns` or `merge_key` you can pick source or destination identifiers. `dlt` normalizes all hints together with your data.
4. `Schema` object (ie. obtained from the pipeline or from `dlt` source via `discover_schema`) **always contains destination (normalized) identifiers**.
@@ -66,16 +58,20 @@ In the snippet below, we define a resource with various "illegal" unicode charac
### Understand the identifier normalization
Identifiers are translated from source to destination form in **normalize** step. Here's how `dlt` picks the right naming convention:
-* Each destination has a preferred naming convention.
-* This naming convention is used when new schemas are created.
-* Schemas preserve naming convention when saved
-* `dlt` applies final naming convention in `normalize` step. Naming convention comes from (1) explicit configuration (2) from destination capabilities. Naming convention
-in schema will be ignored.
-* You can change the naming convention in the capabilities: (name, case-folding, case sensitivity)
+* Each destination may define a preferred naming convention (ie. Weaviate), otherwise **snake case** will be used
+* This naming convention is used when new schemas are created. This happens when the pipeline is run for the first time.
+* Schemas preserve the naming convention when saved. Your running pipelines will maintain existing naming conventions unless requested otherwise
+* `dlt` applies the final naming convention in the `normalize` step. The naming convention comes from (1) explicit configuration or (2) destination capabilities.
+* The naming convention is used to put the destination in case sensitive/insensitive mode and to apply the right case folding function.
+
+:::caution
+If you change the naming convention and `dlt` detects that it changes the destination identifiers for tables/collections/files that already exist and store data,
+the normalize process will fail.
+:::

### Case sensitive and insensitive destinations
Naming conventions come in two types.
-* **case sensitive** naming convention normalize source identifiers into case sensitive identifiers where character
+* **case sensitive**
* **case insensitive**

Case sensitive naming convention will put a destination in [case sensitive mode](destination.md#control-how-dlt-creates-table-column-and-other-identifiers). Identifiers that
@@ -90,24 +86,43 @@ too long. The default shortening behavior generates short deterministic hashes o

## Pick your own naming convention

### Configure naming convention
-The naming convention is configurable and users can easily create their own
-conventions that i.e. pass all the identifiers unchanged if the destination accepts that (i.e.
-DuckDB).
+tbd.

### Available naming conventions
+* snake_case
+* duck_case - case sensitive, allows all unicode characters like emoji 💥
+* direct - case sensitive, allows all unicode characters, does not contract underscores
+* `sql_cs_v1` - case sensitive, generates sql-safe identifiers
+* `sql_ci_v1` - case insensitive, generates sql-safe lower case identifiers
+
### Set and adjust naming convention explicitly
+tbd.

## Avoid identifier collisions
-
- `dlt` detects various types of collisions and ignores the others.
+1. `dlt` detects collisions if a case sensitive naming convention is used on a case insensitive destination
+2. `dlt` detects collisions if a change of naming convention changes the identifiers of tables already created in the destination
+3. `dlt` detects collisions when the naming convention is applied to the column names of arrow tables
+
+`dlt` will not detect collisions when normalizing source data. If you have a dictionary, keys will be merged if they collide after being normalized.
+You can use a naming convention that does not generate collisions, see examples below.

## Write your own naming convention
-Naming conventions reside in separate Python modules, are classes with `NamingConvention` name and must derive from `BaseNamingConvention`. We include two examples of
-naming conventions that you may find useful
+Custom naming conventions are classes that derive from `NamingConvention`, which you can import from `dlt.common.normalizers.naming`. We recommend the following module layout:
+1. Each naming convention resides in a separate Python module (file)
+2. The class is always named `NamingConvention`
+
+In that case you can use a fully qualified module name in [schema configuration](#configure-naming-convention) or pass the module [explicitly](#set-and-adjust-naming-convention-explicitly).
+
+We include [two examples](../examples/custom_naming) of naming conventions that you may find useful:
1. A variant of `sql_ci` that generates identifier collisions with a low (user defined) probability by appending a deterministic tag to each name.
2. A variant of `sql_cs` that allows for LATIN (ie. umlaut) characters
+
+:::note
+Note that a fully qualified name of your custom naming convention will be stored in the `Schema` and `dlt` will attempt to import it when the schema is loaded from storage.
+You should distribute your custom naming conventions with your pipeline code via an installable package with a defined namespace.
+:::
diff --git a/poetry.lock b/poetry.lock
index 1543d079c2..2cef57424d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
[[package]] name = "about-time" @@ -3552,6 +3552,164 @@ files = [ {file = "google_re2-1.1-1-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c6c9f64b9724ec38da8e514f404ac64e9a6a5e8b1d7031c2dadd05c1f4c16fd"}, {file = "google_re2-1.1-1-cp39-cp39-win32.whl", hash = "sha256:d1b751b9ab9f8e2ab2a36d72b909281ce65f328c9115a1685acae1a2d1afd7a4"}, {file = "google_re2-1.1-1-cp39-cp39-win_amd64.whl", hash = "sha256:ac775c75cec7069351d201da4e0fb0cae4c1c5ebecd08fa34e1be89740c1d80b"}, + {file = "google_re2-1.1-2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5eaefe4705b75ca5f78178a50104b689e9282f868e12f119b26b4cffc0c7ee6e"}, + {file = "google_re2-1.1-2-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:e35f2c8aabfaaa4ce6420b3cae86c0c29042b1b4f9937254347e9b985694a171"}, + {file = "google_re2-1.1-2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:35fd189cbaaaa39c9a6a8a00164c8d9c709bacd0c231c694936879609beff516"}, + {file = "google_re2-1.1-2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:60475d222cebd066c80414831c8a42aa2449aab252084102ee05440896586e6a"}, + {file = "google_re2-1.1-2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:871cb85b9b0e1784c983b5c148156b3c5314cb29ca70432dff0d163c5c08d7e5"}, + {file = "google_re2-1.1-2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:94f4e66e34bdb8de91ec6cdf20ba4fa9fea1dfdcfb77ff1f59700d01a0243664"}, + {file = "google_re2-1.1-2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1563577e2b720d267c4cffacc0f6a2b5c8480ea966ebdb1844fbea6602c7496f"}, + {file = "google_re2-1.1-2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:49b7964532a801b96062d78c0222d155873968f823a546a3dbe63d73f25bb56f"}, + {file = "google_re2-1.1-2-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2362fd70eb639a75fd0187d28b4ba7b20b3088833d8ad7ffd8693d0ba159e1c2"}, + {file = "google_re2-1.1-2-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:86b80719636a4e21391e20a9adf18173ee6ae2ec956726fe2ff587417b5e8ba6"}, + {file = "google_re2-1.1-2-cp310-cp310-win32.whl", hash = "sha256:5456fba09df951fe8d1714474ed1ecda102a68ddffab0113e6c117d2e64e6f2b"}, + {file = "google_re2-1.1-2-cp310-cp310-win_amd64.whl", hash = "sha256:2ac6936a3a60d8d9de9563e90227b3aea27068f597274ca192c999a12d8baa8f"}, + {file = "google_re2-1.1-2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d5a87b436028ec9b0f02fe19d4cbc19ef30441085cdfcdf1cce8fbe5c4bd5e9a"}, + {file = "google_re2-1.1-2-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:fc0d4163de9ed2155a77e7a2d59d94c348a6bbab3cff88922fab9e0d3d24faec"}, + {file = "google_re2-1.1-2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:48b12d953bc796736e7831d67b36892fb6419a4cc44cb16521fe291e594bfe23"}, + {file = "google_re2-1.1-2-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:62c780c927cff98c1538439f0ff616f48a9b2e8837c676f53170d8ae5b9e83cb"}, + {file = "google_re2-1.1-2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:04b2aefd768aa4edeef8b273327806c9cb0b82e90ff52eacf5d11003ac7a0db2"}, + {file = "google_re2-1.1-2-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:9c90175992346519ee7546d9af9a64541c05b6b70346b0ddc54a48aa0d3b6554"}, + {file = "google_re2-1.1-2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22ad9ad9d125249d6386a2e80efb9de7af8260b703b6be7fa0ab069c1cf56ced"}, + {file = "google_re2-1.1-2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:f70971f6ffe5254e476e71d449089917f50ebf9cf60f9cec80975ab1693777e2"}, + {file = "google_re2-1.1-2-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f267499529e64a4abed24c588f355ebe4700189d434d84a7367725f5a186e48d"}, + {file = "google_re2-1.1-2-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b632eff5e4cd44545a9c0e52f2e1becd55831e25f4dd4e0d7ec8ee6ca50858c1"}, + {file = "google_re2-1.1-2-cp311-cp311-win32.whl", hash = "sha256:a42c733036e8f242ee4e5f0e27153ad4ca44ced9e4ce82f3972938ddee528db0"}, + {file = "google_re2-1.1-2-cp311-cp311-win_amd64.whl", hash = "sha256:64f8eed4ca96905d99b5286b3d14b5ca4f6a025ff3c1351626a7df2f93ad1ddd"}, + {file = "google_re2-1.1-2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5541efcca5b5faf7e0d882334a04fa479bad4e7433f94870f46272eec0672c4a"}, + {file = "google_re2-1.1-2-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:92309af35b6eb2d3b3dc57045cdd83a76370958ab3e0edd2cc4638f6d23f5b32"}, + {file = "google_re2-1.1-2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:197cd9bcaba96d18c5bf84d0c32fca7a26c234ea83b1d3083366f4392cb99f78"}, + {file = "google_re2-1.1-2-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:1b896f171d29b541256cf26e10dccc9103ac1894683914ed88828ca6facf8dca"}, + {file = "google_re2-1.1-2-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:e022d3239b945014e916ca7120fee659b246ec26c301f9e0542f1a19b38a8744"}, + {file = "google_re2-1.1-2-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:2c73f8a9440873b68bee1198094377501065e85aaf6fcc0d2512c7589ffa06ca"}, + {file = "google_re2-1.1-2-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:901d86555bd7725506d651afaba7d71cd4abd13260aed6cfd7c641a45f76d4f6"}, + {file = "google_re2-1.1-2-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ce4710ff636701cfb56eb91c19b775d53b03749a23b7d2a5071bbbf4342a9067"}, + {file = "google_re2-1.1-2-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:76a20e5ebdf5bc5d430530197e42a2eeb562f729d3a3fb51f39168283d676e66"}, + {file = "google_re2-1.1-2-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:77c9f4d4bb1c8de9d2642d3c4b8b615858ba764df025b3b4f1310266f8def269"}, + {file = "google_re2-1.1-2-cp38-cp38-win32.whl", hash = "sha256:94bd60785bf37ef130a1613738e3c39465a67eae3f3be44bb918540d39b68da3"}, + {file = "google_re2-1.1-2-cp38-cp38-win_amd64.whl", hash = "sha256:59efeb77c0dcdbe37794c61f29c5b1f34bc06e8ec309a111ccdd29d380644d70"}, + {file = "google_re2-1.1-2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:221e38c27e1dd9ccb8e911e9c7aed6439f68ce81e7bb74001076830b0d6e931d"}, + {file = "google_re2-1.1-2-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:d9145879e6c2e1b814445300b31f88a675e1f06c57564670d95a1442e8370c27"}, + {file = "google_re2-1.1-2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:c8a12f0740e2a52826bdbf95569a4b0abdf413b4012fa71e94ad25dd4715c6e5"}, + {file = "google_re2-1.1-2-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:9c9998f71466f4db7bda752aa7c348b2881ff688e361108fe500caad1d8b9cb2"}, + {file = "google_re2-1.1-2-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:0c39f69b702005963a3d3bf78743e1733ad73efd7e6e8465d76e3009e4694ceb"}, + {file = "google_re2-1.1-2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:6d0ce762dee8d6617d0b1788a9653e805e83a23046c441d0ea65f1e27bf84114"}, + {file = "google_re2-1.1-2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:ecf3619d98c9b4a7844ab52552ad32597cdbc9a5bdbc7e3435391c653600d1e2"}, + {file = "google_re2-1.1-2-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9a1426a8cbd1fa004974574708d496005bd379310c4b1c7012be4bc75efde7a8"}, + {file = "google_re2-1.1-2-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a1a30626ba48b4070f3eab272d860ef1952e710b088792c4d68dddb155be6bfc"}, + {file = "google_re2-1.1-2-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1b9c1ffcfbc3095b6ff601ec2d2bf662988f6ea6763bc1c9d52bec55881f8fde"}, + {file = "google_re2-1.1-2-cp39-cp39-win32.whl", hash = "sha256:32ecf995a252c0548404c1065ba4b36f1e524f1f4a86b6367a1a6c3da3801e30"}, + {file = "google_re2-1.1-2-cp39-cp39-win_amd64.whl", hash = "sha256:e7865410f3b112a3609739283ec3f4f6f25aae827ff59c6bfdf806fd394d753e"}, + {file = "google_re2-1.1-3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3b21f83f0a201009c56f06fcc7294a33555ede97130e8a91b3f4cae01aed1d73"}, + {file = "google_re2-1.1-3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b38194b91354a38db1f86f25d09cdc6ac85d63aee4c67b43da3048ce637adf45"}, + {file = "google_re2-1.1-3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e7da3da8d6b5a18d6c3b61b11cc5b66b8564eaedce99d2312b15b6487730fc76"}, + {file = "google_re2-1.1-3-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:aeca656fb10d8638f245331aabab59c9e7e051ca974b366dd79e6a9efb12e401"}, + {file = "google_re2-1.1-3-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:2069d6dc94f5fa14a159bf99cad2f11e9c0f8ec3b7f44a4dde9e59afe5d1c786"}, + {file = "google_re2-1.1-3-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:2319a39305a4931cb5251451f2582713418a19bef2af7adf9e2a7a0edd939b99"}, + {file = "google_re2-1.1-3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb98fc131699756c6d86246f670a5e1c1cc1ba85413c425ad344cb30479b246c"}, + {file = "google_re2-1.1-3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a6e038986d8ffe4e269f8532f03009f229d1f6018d4ac0dabc8aff876338f6e0"}, + {file = "google_re2-1.1-3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8618343ee658310e0f53bf586fab7409de43ce82bf8d9f7eb119536adc9783fd"}, + {file = "google_re2-1.1-3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d8140ca861cfe00602319cefe2c7b8737b379eb07fb328b51dc44584f47a2718"}, + {file = "google_re2-1.1-3-cp310-cp310-win32.whl", hash = "sha256:41f439c5c54e8a3a0a1fa2dbd1e809d3f643f862df7b16dd790f36a1238a272e"}, + {file = "google_re2-1.1-3-cp310-cp310-win_amd64.whl", hash = "sha256:fe20e97a33176d96d3e4b5b401de35182b9505823abea51425ec011f53ef5e56"}, + {file = "google_re2-1.1-3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c39ff52b1765db039f690ee5b7b23919d8535aae94db7996079fbde0098c4d7"}, + {file = "google_re2-1.1-3-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:5420be674fd164041639ba4c825450f3d4bd635572acdde16b3dcd697f8aa3ef"}, + {file = "google_re2-1.1-3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:ff53881cf1ce040f102a42d39db93c3f835f522337ae9c79839a842f26d97733"}, + {file = "google_re2-1.1-3-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:8d04600b0b53523118df2e413a71417c408f20dee640bf07dfab601c96a18a77"}, + {file = "google_re2-1.1-3-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:c4835d4849faa34a7fa1074098d81c420ed6c0707a3772482b02ce14f2a7c007"}, + {file = "google_re2-1.1-3-cp311-cp311-macosx_13_0_x86_64.whl", hash = 
"sha256:3309a9b81251d35fee15974d0ae0581a9a375266deeafdc3a3ac0d172a742357"}, + {file = "google_re2-1.1-3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e2b51cafee7e0bc72d0a4a454547bd8f257cde412ac9f1a2dc46a203b5e42cf4"}, + {file = "google_re2-1.1-3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:83f5f1cb52f832c2297d271ee8c56cf5e9053448162e5d2223d513f729bad908"}, + {file = "google_re2-1.1-3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:55865a1ace92be3f7953b2e2b38b901d8074a367aa491daee43260a53a7fc6f0"}, + {file = "google_re2-1.1-3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cec2167dd142e583e98c783bd0d28b8cf5a9cdbe1f7407ba4163fe3ccb613cb9"}, + {file = "google_re2-1.1-3-cp311-cp311-win32.whl", hash = "sha256:a0bc1fe96849e4eb8b726d0bba493f5b989372243b32fe20729cace02e5a214d"}, + {file = "google_re2-1.1-3-cp311-cp311-win_amd64.whl", hash = "sha256:e6310a156db96fc5957cb007dd2feb18476898654530683897469447df73a7cd"}, + {file = "google_re2-1.1-3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8e63cd10ea006088b320e8c5d308da1f6c87aa95138a71c60dd7ca1c8e91927e"}, + {file = "google_re2-1.1-3-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:12b566830a334178733a85e416b1e0507dbc0ceb322827616fe51ef56c5154f1"}, + {file = "google_re2-1.1-3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:442e18c9d46b225c1496919c16eafe8f8d9bb4091b00b4d3440da03c55bbf4ed"}, + {file = "google_re2-1.1-3-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:c54c00263a9c39b2dacd93e9636319af51e3cf885c080b9680a9631708326460"}, + {file = "google_re2-1.1-3-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:15a3caeeb327bc22e0c9f95eb76890fec8874cacccd2b01ff5c080ab4819bbec"}, + {file = "google_re2-1.1-3-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:59ec0d2cced77f715d41f6eafd901f6b15c11e28ba25fe0effdc1de554d78e75"}, + {file = "google_re2-1.1-3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:185bf0e3441aed3840590f8e42f916e2920d235eb14df2cbc2049526803d3e71"}, + {file = "google_re2-1.1-3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:586d3f2014eea5be14d8de53374d9b79fa99689160e00efa64b5fe93af326087"}, + {file = "google_re2-1.1-3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cc2575082de4ffd234d9607f3ae67ca22b15a1a88793240e2045f3b3a36a5795"}, + {file = "google_re2-1.1-3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:59c5ad438eddb3630def394456091284d7bbc5b89351987f94f3792d296d1f96"}, + {file = "google_re2-1.1-3-cp312-cp312-win32.whl", hash = "sha256:5b9878c53f2bf16f75bf71d4ddd57f6611351408d5821040e91c53ebdf82c373"}, + {file = "google_re2-1.1-3-cp312-cp312-win_amd64.whl", hash = "sha256:4fdecfeb213110d0a85bad335a8e7cdb59fea7de81a4fe659233f487171980f9"}, + {file = "google_re2-1.1-3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2dd87bacab32b709c28d0145fe75a956b6a39e28f0726d867375dba5721c76c1"}, + {file = "google_re2-1.1-3-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:55d24c61fe35dddc1bb484593a57c9f60f9e66d7f31f091ef9608ed0b6dde79f"}, + {file = "google_re2-1.1-3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a0cf1180d908622df648c26b0cd09281f92129805ccc56a39227fdbfeab95cb4"}, + {file = "google_re2-1.1-3-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:09586f07f3f88d432265c75976da1c619ab7192cd7ebdf53f4ae0776c19e4b56"}, + {file = "google_re2-1.1-3-cp38-cp38-macosx_13_0_arm64.whl", hash = 
"sha256:539f1b053402203576e919a06749198da4ae415931ee28948a1898131ae932ce"}, + {file = "google_re2-1.1-3-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:abf0bcb5365b0e27a5a23f3da403dffdbbac2c0e3a3f1535a8b10cc121b5d5fb"}, + {file = "google_re2-1.1-3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:19c83e5bbed7958213eeac3aa71c506525ce54faf03e07d0b96cd0a764890511"}, + {file = "google_re2-1.1-3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3348e77330ff672dc44ec01894fa5d93c409a532b6d688feac55e714e9059920"}, + {file = "google_re2-1.1-3-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:06b63edb57c5ce5a13eabfd71155e346b9477dc8906dec7c580d4f70c16a7e0d"}, + {file = "google_re2-1.1-3-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:12fe57ba2914092b83338d61d8def9ebd5a2bd0fd8679eceb5d4c2748105d5c0"}, + {file = "google_re2-1.1-3-cp38-cp38-win32.whl", hash = "sha256:80796e08d24e606e675019fe8de4eb5c94bb765be13c384f2695247d54a6df75"}, + {file = "google_re2-1.1-3-cp38-cp38-win_amd64.whl", hash = "sha256:3c2257dedfe7cc5deb6791e563af9e071a9d414dad89e37ac7ad22f91be171a9"}, + {file = "google_re2-1.1-3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:43a0cd77c87c894f28969ac622f94b2e6d1571261dfdd785026848a25cfdc9b9"}, + {file = "google_re2-1.1-3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:1038990b77fd66f279bd66a0832b67435ea925e15bb59eafc7b60fdec812b616"}, + {file = "google_re2-1.1-3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fb5dda6875d18dd45f0f24ebced6d1f7388867c8fb04a235d1deab7ea479ce38"}, + {file = "google_re2-1.1-3-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:bb1d164965c6d57a351b421d2f77c051403766a8b75aaa602324ee2451fff77f"}, + {file = "google_re2-1.1-3-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a072ebfa495051d07ffecbf6ce21eb84793568d5c3c678c00ed8ff6b8066ab31"}, + {file = "google_re2-1.1-3-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:4eb66c8398c8a510adc97978d944b3b29c91181237218841ea1a91dc39ec0e54"}, + {file = "google_re2-1.1-3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f7c8b57b1f559553248d1757b7fa5b2e0cc845666738d155dff1987c2618264e"}, + {file = "google_re2-1.1-3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9162f6aa4f25453c682eb176f21b8e2f40205be9f667e98a54b3e1ff10d6ee75"}, + {file = "google_re2-1.1-3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a2d65ddf67fd7bf94705626871d463057d3d9a3538d41022f95b9d8f01df36e1"}, + {file = "google_re2-1.1-3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d140c7b9395b4d1e654127aa1c99bcc603ed01000b7bc7e28c52562f1894ec12"}, + {file = "google_re2-1.1-3-cp39-cp39-win32.whl", hash = "sha256:80c5fc200f64b2d903eeb07b8d6cefc620a872a0240c7caaa9aca05b20f5568f"}, + {file = "google_re2-1.1-3-cp39-cp39-win_amd64.whl", hash = "sha256:9eb6dbcee9b5dc4069bbc0634f2eb039ca524a14bed5868fdf6560aaafcbca06"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0db114d7e1aa96dbcea452a40136d7d747d60cbb61394965774688ef59cccd4e"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:82133958e003a1344e5b7a791b9a9dd7560b5c8f96936dbe16f294604524a633"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:9e74fd441d1f3d917d3303e319f61b82cdbd96b9a5ba919377a6eef1504a1e2b"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_13_0_x86_64.whl", hash = 
"sha256:734a2e7a4541c57253b5ebee24f3f3366ba3658bcad01da25fb623c78723471a"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:d88d5eecbc908abe16132456fae13690d0508f3ac5777f320ef95cb6cab9a961"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:b91db80b171ecec435a07977a227757dd487356701a32f556fa6fca5d0a40522"}, + {file = "google_re2-1.1-4-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b23129887a64bb9948af14c84705273ed1a40054e99433b4acccab4dcf6a226"}, + {file = "google_re2-1.1-4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5dc1a0cc7cd19261dcaf76763e2499305dbb7e51dc69555167cdb8af98782698"}, + {file = "google_re2-1.1-4-cp310-cp310-win32.whl", hash = "sha256:3b2ab1e2420b5dd9743a2d6bc61b64e5f708563702a75b6db86637837eaeaf2f"}, + {file = "google_re2-1.1-4-cp310-cp310-win_amd64.whl", hash = "sha256:92efca1a7ef83b6df012d432a1cbc71d10ff42200640c0f9a5ff5b343a48e633"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:854818fd4ce79787aca5ba459d6e5abe4ca9be2c684a5b06a7f1757452ca3708"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:4ceef51174b6f653b6659a8fdaa9c38960c5228b44b25be2a3bcd8566827554f"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:ee49087c3db7e6f5238105ab5299c09e9b77516fe8cfb0a37e5f1e813d76ecb8"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:dc2312854bdc01410acc5d935f1906a49cb1f28980341c20a68797ad89d8e178"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:0dc0d2e42296fa84a3cb3e1bd667c6969389cd5cdf0786e6b1f911ae2d75375b"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6bf04ced98453b035f84320f348f67578024f44d2997498def149054eb860ae8"}, + {file = "google_re2-1.1-4-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d6b6ef11dc4ab322fa66c2f3561925f2b5372a879c3ed764d20e939e2fd3e5f"}, + {file = "google_re2-1.1-4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0dcde6646fa9a97fd3692b3f6ae7daf7f3277d7500b6c253badeefa11db8956a"}, + {file = "google_re2-1.1-4-cp311-cp311-win32.whl", hash = "sha256:5f4f0229deb057348893574d5b0a96d055abebac6debf29d95b0c0e26524c9f6"}, + {file = "google_re2-1.1-4-cp311-cp311-win_amd64.whl", hash = "sha256:4713ddbe48a18875270b36a462b0eada5e84d6826f8df7edd328d8706b6f9d07"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:40a698300b8faddbb325662973f839489c89b960087060bd389c376828978a04"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:103d2d7ac92ba23911a151fd1fc7035cbf6dc92a7f6aea92270ebceb5cd5acd3"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:51fb7182bccab05e8258a2b6a63dda1a6b4a9e8dfb9b03ec50e50c49c2827dd4"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:65383022abd63d7b620221eba7935132b53244b8b463d8fdce498c93cf58b7b7"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:396281fc68a9337157b3ffcd9392c6b7fcb8aab43e5bdab496262a81d56a4ecc"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:8198adcfcff1c680e052044124621730fc48d08005f90a75487f5651f1ebfce2"}, + {file = "google_re2-1.1-4-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:81f7bff07c448aec4db9ca453d2126ece8710dbd9278b8bb09642045d3402a96"}, + {file = "google_re2-1.1-4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7dacf730fd7d6ec71b11d6404b0b26e230814bfc8e9bb0d3f13bec9b5531f8d"}, + {file = "google_re2-1.1-4-cp312-cp312-win32.whl", hash = "sha256:8c764f62f4b1d89d1ef264853b6dd9fee14a89e9b86a81bc2157fe3531425eb4"}, + {file = "google_re2-1.1-4-cp312-cp312-win_amd64.whl", hash = "sha256:0be2666df4bc5381a5d693585f9bbfefb0bfd3c07530d7e403f181f5de47254a"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:5cb1b63a0bfd8dd65d39d2f3b2e5ae0a06ce4b2ce5818a1d1fc78a786a252673"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:e41751ce6b67a95230edd0772226dc94c2952a2909674cd69df9804ed0125307"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:b998cfa2d50bf4c063e777c999a7e8645ec7e5d7baf43ad71b1e2e10bb0300c3"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:226ca3b0c2e970f3fc82001ac89e845ecc7a4bb7c68583e7a76cda70b61251a7"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:9adec1f734ebad7c72e56c85f205a281d8fe9bf6583bc21020157d3f2812ce89"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:9c34f3c64ba566af967d29e11299560e6fdfacd8ca695120a7062b6ed993b179"}, + {file = "google_re2-1.1-4-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e1b85385fe293838e0d0b6e19e6c48ba8c6f739ea92ce2e23b718afe7b343363"}, + {file = "google_re2-1.1-4-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4694daa8a8987cfb568847aa872f9990e930c91a68c892ead876411d4b9012c3"}, + {file = "google_re2-1.1-4-cp38-cp38-win32.whl", hash = "sha256:5e671e9be1668187e2995aac378de574fa40df70bb6f04657af4d30a79274ce0"}, + {file = "google_re2-1.1-4-cp38-cp38-win_amd64.whl", hash = "sha256:f66c164d6049a8299f6dfcfa52d1580576b4b9724d6fcdad2f36f8f5da9304b6"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:25cb17ae0993a48c70596f3a3ef5d659638106401cc8193f51c0d7961b3b3eb7"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:5f101f86d14ca94ca4dcf63cceaa73d351f2be2481fcaa29d9e68eeab0dc2a88"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:4e82591e85bf262a6d74cff152867e05fc97867c68ba81d6836ff8b0e7e62365"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:1f61c09b93ffd34b1e2557e5a9565039f935407a5786dbad46f64f1a484166e6"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:12b390ad8c7e74bab068732f774e75e0680dade6469b249a721f3432f90edfc3"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:1284343eb31c2e82ed2d8159f33ba6842238a56782c881b07845a6d85613b055"}, + {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c7b38e0daf2c06e4d3163f4c732ab3ad2521aecfed6605b69e4482c612da303"}, + {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"}, + {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = "sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"}, + {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"}, ] [[package]] @@ -4540,10 +4698,13 @@ files = [ {file = 
"lxml-4.9.3-cp27-cp27m-macosx_11_0_x86_64.whl", hash = "sha256:b0a545b46b526d418eb91754565ba5b63b1c0b12f9bd2f808c852d9b4b2f9b5c"}, {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:075b731ddd9e7f68ad24c635374211376aa05a281673ede86cbe1d1b3455279d"}, {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1e224d5755dba2f4a9498e150c43792392ac9b5380aa1b845f98a1618c94eeef"}, + {file = "lxml-4.9.3-cp27-cp27m-win32.whl", hash = "sha256:2c74524e179f2ad6d2a4f7caf70e2d96639c0954c943ad601a9e146c76408ed7"}, + {file = "lxml-4.9.3-cp27-cp27m-win_amd64.whl", hash = "sha256:4f1026bc732b6a7f96369f7bfe1a4f2290fb34dce00d8644bc3036fb351a4ca1"}, {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c0781a98ff5e6586926293e59480b64ddd46282953203c76ae15dbbbf302e8bb"}, {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cef2502e7e8a96fe5ad686d60b49e1ab03e438bd9123987994528febd569868e"}, {file = "lxml-4.9.3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b86164d2cff4d3aaa1f04a14685cbc072efd0b4f99ca5708b2ad1b9b5988a991"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:42871176e7896d5d45138f6d28751053c711ed4d48d8e30b498da155af39aebd"}, + {file = "lxml-4.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ae8b9c6deb1e634ba4f1930eb67ef6e6bf6a44b6eb5ad605642b2d6d5ed9ce3c"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:411007c0d88188d9f621b11d252cce90c4a2d1a49db6c068e3c16422f306eab8"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:cd47b4a0d41d2afa3e58e5bf1f62069255aa2fd6ff5ee41604418ca925911d76"}, {file = "lxml-4.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e2cb47860da1f7e9a5256254b74ae331687b9672dfa780eed355c4c9c3dbd23"}, @@ -4552,6 +4713,7 @@ files = [ {file = "lxml-4.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:97047f0d25cd4bcae81f9ec9dc290ca3e15927c192df17331b53bebe0e3ff96d"}, {file = "lxml-4.9.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:1f447ea5429b54f9582d4b955f5f1985f278ce5cf169f72eea8afd9502973dd5"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:57d6ba0ca2b0c462f339640d22882acc711de224d769edf29962b09f77129cbf"}, + {file = "lxml-4.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:9767e79108424fb6c3edf8f81e6730666a50feb01a328f4a016464a5893f835a"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:71c52db65e4b56b8ddc5bb89fb2e66c558ed9d1a74a45ceb7dcb20c191c3df2f"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d73d8ecf8ecf10a3bd007f2192725a34bd62898e8da27eb9d32a58084f93962b"}, {file = "lxml-4.9.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0a3d3487f07c1d7f150894c238299934a2a074ef590b583103a45002035be120"}, @@ -4571,6 +4733,7 @@ files = [ {file = "lxml-4.9.3-cp36-cp36m-macosx_11_0_x86_64.whl", hash = "sha256:64f479d719dc9f4c813ad9bb6b28f8390360660b73b2e4beb4cb0ae7104f1c12"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:dd708cf4ee4408cf46a48b108fb9427bfa00b9b85812a9262b5c668af2533ea5"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:5c31c7462abdf8f2ac0577d9f05279727e698f97ecbb02f17939ea99ae8daa98"}, + {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e3cd95e10c2610c360154afdc2f1480aea394f4a4f1ea0a5eacce49640c9b190"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:4930be26af26ac545c3dffb662521d4e6268352866956672231887d18f0eaab2"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4aec80cde9197340bc353d2768e2a75f5f60bacda2bab72ab1dc499589b3878c"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:14e019fd83b831b2e61baed40cab76222139926b1fb5ed0e79225bc0cae14584"}, @@ -4580,6 +4743,7 @@ files = [ {file = "lxml-4.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bef4e656f7d98aaa3486d2627e7d2df1157d7e88e7efd43a65aa5dd4714916cf"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:46f409a2d60f634fe550f7133ed30ad5321ae2e6630f13657fb9479506b00601"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:4c28a9144688aef80d6ea666c809b4b0e50010a2aca784c97f5e6bf143d9f129"}, + {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:141f1d1a9b663c679dc524af3ea1773e618907e96075262726c7612c02b149a4"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:53ace1c1fd5a74ef662f844a0413446c0629d151055340e9893da958a374f70d"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17a753023436a18e27dd7769e798ce302963c236bc4114ceee5b25c18c52c693"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7d298a1bd60c067ea75d9f684f5f3992c9d6766fadbc0bcedd39750bf344c2f4"}, @@ -4589,6 +4753,7 @@ files = [ {file = "lxml-4.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:120fa9349a24c7043854c53cae8cec227e1f79195a7493e09e0c12e29f918e52"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4d2d1edbca80b510443f51afd8496be95529db04a509bc8faee49c7b0fb6d2cc"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:8d7e43bd40f65f7d97ad8ef5c9b1778943d02f04febef12def25f7583d19baac"}, + {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:71d66ee82e7417828af6ecd7db817913cb0cf9d4e61aa0ac1fde0583d84358db"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:6fc3c450eaa0b56f815c7b62f2b7fba7266c4779adcf1cece9e6deb1de7305ce"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:65299ea57d82fb91c7f019300d24050c4ddeb7c5a190e076b5f48a2b43d19c42"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:eadfbbbfb41b44034a4c757fd5d70baccd43296fb894dba0295606a7cf3124aa"}, @@ -4598,6 +4763,7 @@ files = [ {file = "lxml-4.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:92af161ecbdb2883c4593d5ed4815ea71b31fafd7fd05789b23100d081ecac96"}, {file = "lxml-4.9.3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:9bb6ad405121241e99a86efff22d3ef469024ce22875a7ae045896ad23ba2340"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8ed74706b26ad100433da4b9d807eae371efaa266ffc3e9191ea436087a9d6a7"}, + {file = 
"lxml-4.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:fbf521479bcac1e25a663df882c46a641a9bff6b56dc8b0fafaebd2f66fb231b"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:303bf1edce6ced16bf67a18a1cf8339d0db79577eec5d9a6d4a80f0fb10aa2da"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:5515edd2a6d1a5a70bfcdee23b42ec33425e405c5b351478ab7dc9347228f96e"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:690dafd0b187ed38583a648076865d8c229661ed20e48f2335d68e2cf7dc829d"}, @@ -4608,13 +4774,16 @@ files = [ {file = "lxml-4.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:4dd9a263e845a72eacb60d12401e37c616438ea2e5442885f65082c276dfb2b2"}, {file = "lxml-4.9.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6689a3d7fd13dc687e9102a27e98ef33730ac4fe37795d5036d18b4d527abd35"}, {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:f6bdac493b949141b733c5345b6ba8f87a226029cbabc7e9e121a413e49441e0"}, + {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:05186a0f1346ae12553d66df1cfce6f251589fea3ad3da4f3ef4e34b2d58c6a3"}, {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c2006f5c8d28dee289f7020f721354362fa304acbaaf9745751ac4006650254b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-macosx_11_0_x86_64.whl", hash = "sha256:5c245b783db29c4e4fbbbfc9c5a78be496c9fea25517f90606aa1f6b2b3d5f7b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4fb960a632a49f2f089d522f70496640fdf1218f1243889da3822e0a9f5f3ba7"}, + {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:50670615eaf97227d5dc60de2dc99fb134a7130d310d783314e7724bf163f75d"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9719fe17307a9e814580af1f5c6e05ca593b12fb7e44fe62450a5384dbf61b4b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:3331bece23c9ee066e0fb3f96c61322b9e0f54d775fccefff4c38ca488de283a"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-macosx_11_0_x86_64.whl", hash = "sha256:ed667f49b11360951e201453fc3967344d0d0263aa415e1619e85ae7fd17b4e0"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8b77946fd508cbf0fccd8e400a7f71d4ac0e1595812e66025bac475a8e811694"}, + {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e4da8ca0c0c0aea88fd46be8e44bd49716772358d648cce45fe387f7b92374a7"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fe4bda6bd4340caa6e5cf95e73f8fea5c4bfc55763dd42f1b50a94c1b4a2fbd4"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f3df3db1d336b9356dd3112eae5f5c2b8b377f3bc826848567f10bfddfee77e9"}, {file = "lxml-4.9.3.tar.gz", hash = "sha256:48628bd53a426c9eb9bc066a923acaa0878d1e86129fd5359aee99285f4eed9c"}, @@ -4775,6 +4944,16 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = 
"sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -6836,6 +7015,7 @@ files = [ {file = "pymongo-4.6.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ab6bcc8e424e07c1d4ba6df96f7fb963bcb48f590b9456de9ebd03b88084fe8"}, {file = "pymongo-4.6.0-cp312-cp312-win32.whl", hash = "sha256:47aa128be2e66abd9d1a9b0437c62499d812d291f17b55185cb4aa33a5f710a4"}, {file = "pymongo-4.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:014e7049dd019a6663747ca7dae328943e14f7261f7c1381045dfc26a04fa330"}, + {file = "pymongo-4.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e24025625bad66895b1bc3ae1647f48f0a92dd014108fb1be404c77f0b69ca67"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:288c21ab9531b037f7efa4e467b33176bc73a0c27223c141b822ab4a0e66ff2a"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:747c84f4e690fbe6999c90ac97246c95d31460d890510e4a3fa61b7d2b87aa34"}, {file = "pymongo-4.6.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:055f5c266e2767a88bb585d01137d9c7f778b0195d3dbf4a487ef0638be9b651"}, @@ -7276,6 +7456,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -7283,8 +7464,16 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -7301,6 +7490,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -7308,6 +7498,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -8231,6 +8422,7 @@ files = [ {file = "SQLAlchemy-1.4.49-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:03db81b89fe7ef3857b4a00b63dedd632d6183d4ea5a31c5d8a92e000a41fc71"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:95b9df9afd680b7a3b13b38adf6e3a38995da5e162cc7524ef08e3be4e5ed3e1"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a63e43bf3f668c11bb0444ce6e809c1227b8f067ca1068898f3008a273f52b09"}, + {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca46de16650d143a928d10842939dab208e8d8c3a9a8757600cae9b7c579c5cd"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f835c050ebaa4e48b18403bed2c0fda986525896efd76c245bdd4db995e51a4c"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c21b172dfb22e0db303ff6419451f0cac891d2e911bb9fbf8003d717f1bcf91"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-win32.whl", hash = "sha256:5fb1ebdfc8373b5a291485757bd6431de8d7ed42c27439f543c81f6c8febd729"}, @@ -8240,26 +8432,35 @@ files = [ {file = "SQLAlchemy-1.4.49-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5debe7d49b8acf1f3035317e63d9ec8d5e4d904c6e75a2a9246a119f5f2fdf3d"}, {file = "SQLAlchemy-1.4.49-cp311-cp311-win32.whl", hash = "sha256:82b08e82da3756765c2e75f327b9bf6b0f043c9c3925fb95fb51e1567fa4ee87"}, {file = 
"SQLAlchemy-1.4.49-cp311-cp311-win_amd64.whl", hash = "sha256:171e04eeb5d1c0d96a544caf982621a1711d078dbc5c96f11d6469169bd003f1"}, + {file = "SQLAlchemy-1.4.49-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f23755c384c2969ca2f7667a83f7c5648fcf8b62a3f2bbd883d805454964a800"}, + {file = "SQLAlchemy-1.4.49-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8396e896e08e37032e87e7fbf4a15f431aa878c286dc7f79e616c2feacdb366c"}, + {file = "SQLAlchemy-1.4.49-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66da9627cfcc43bbdebd47bfe0145bb662041472393c03b7802253993b6b7c90"}, + {file = "SQLAlchemy-1.4.49-cp312-cp312-win32.whl", hash = "sha256:9a06e046ffeb8a484279e54bda0a5abfd9675f594a2e38ef3133d7e4d75b6214"}, + {file = "SQLAlchemy-1.4.49-cp312-cp312-win_amd64.whl", hash = "sha256:7cf8b90ad84ad3a45098b1c9f56f2b161601e4670827d6b892ea0e884569bd1d"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:36e58f8c4fe43984384e3fbe6341ac99b6b4e083de2fe838f0fdb91cebe9e9cb"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b31e67ff419013f99ad6f8fc73ee19ea31585e1e9fe773744c0f3ce58c039c30"}, + {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc22807a7e161c0d8f3da34018ab7c97ef6223578fcdd99b1d3e7ed1100a5db"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c14b29d9e1529f99efd550cd04dbb6db6ba5d690abb96d52de2bff4ed518bc95"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c40f3470e084d31247aea228aa1c39bbc0904c2b9ccbf5d3cfa2ea2dac06f26d"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-win32.whl", hash = "sha256:706bfa02157b97c136547c406f263e4c6274a7b061b3eb9742915dd774bbc264"}, {file = "SQLAlchemy-1.4.49-cp36-cp36m-win_amd64.whl", hash = "sha256:a7f7b5c07ae5c0cfd24c2db86071fb2a3d947da7bd487e359cc91e67ac1c6d2e"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:4afbbf5ef41ac18e02c8dc1f86c04b22b7a2125f2a030e25bbb4aff31abb224b"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:24e300c0c2147484a002b175f4e1361f102e82c345bf263242f0449672a4bccf"}, + {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:393cd06c3b00b57f5421e2133e088df9cabcececcea180327e43b937b5a7caa5"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:201de072b818f8ad55c80d18d1a788729cccf9be6d9dc3b9d8613b053cd4836d"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7653ed6817c710d0c95558232aba799307d14ae084cc9b1f4c389157ec50df5c"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-win32.whl", hash = "sha256:647e0b309cb4512b1f1b78471fdaf72921b6fa6e750b9f891e09c6e2f0e5326f"}, {file = "SQLAlchemy-1.4.49-cp37-cp37m-win_amd64.whl", hash = "sha256:ab73ed1a05ff539afc4a7f8cf371764cdf79768ecb7d2ec691e3ff89abbc541e"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-macosx_11_0_x86_64.whl", hash = 
"sha256:37ce517c011560d68f1ffb28af65d7e06f873f191eb3a73af5671e9c3fada08a"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1878ce508edea4a879015ab5215546c444233881301e97ca16fe251e89f1c55"}, + {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95ab792ca493891d7a45a077e35b418f68435efb3e1706cb8155e20e86a9013c"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0e8e608983e6f85d0852ca61f97e521b62e67969e6e640fe6c6b575d4db68557"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccf956da45290df6e809ea12c54c02ace7f8ff4d765d6d3dfb3655ee876ce58d"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-win32.whl", hash = "sha256:f167c8175ab908ce48bd6550679cc6ea20ae169379e73c7720a28f89e53aa532"}, {file = "SQLAlchemy-1.4.49-cp38-cp38-win_amd64.whl", hash = "sha256:45806315aae81a0c202752558f0df52b42d11dd7ba0097bf71e253b4215f34f4"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:b6d0c4b15d65087738a6e22e0ff461b407533ff65a73b818089efc8eb2b3e1de"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a843e34abfd4c797018fd8d00ffffa99fd5184c421f190b6ca99def4087689bd"}, + {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:738d7321212941ab19ba2acf02a68b8ee64987b248ffa2101630e8fccb549e0d"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1c890421651b45a681181301b3497e4d57c0d01dc001e10438a40e9a9c25ee77"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d26f280b8f0a8f497bc10573849ad6dc62e671d2468826e5c748d04ed9e670d5"}, {file = "SQLAlchemy-1.4.49-cp39-cp39-win32.whl", hash = "sha256:ec2268de67f73b43320383947e74700e95c6770d0c68c4e615e9897e46296294"}, @@ -8802,6 +9003,17 @@ files = [ {file = "types_PyYAML-6.0.12.11-py3-none-any.whl", hash = "sha256:a461508f3096d1d5810ec5ab95d7eeecb651f3a15b71959999988942063bf01d"}, ] +[[package]] +name = "types-regex" +version = "2024.5.15.20240519" +description = "Typing stubs for regex" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-regex-2024.5.15.20240519.tar.gz", hash = "sha256:ef3f594a95a95d6b9b5704a1facf3511a73e4731209ddb8868461db4c42dc12b"}, + {file = "types_regex-2024.5.15.20240519-py3-none-any.whl", hash = "sha256:d5895079cc66f91ae8818aeef14e9337c492ceb87ad0ff3df8c1c04d418cb9dd"}, +] + [[package]] name = "types-requests" version = "2.31.0.2" @@ -9445,4 +9657,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "9979b133732a91a49fe3014afde0f5e3455cbc26a129aecad6171672c3f0f4a9" +content-hash = "e517168f2ff67c46f3b37d7dcde88b73a1e2ae0d6890243b4c6d1e0aa504eff7" diff --git a/pyproject.toml b/pyproject.toml index cd4d6a78da..849626314a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "0.4.13a0" +version = "0.5.1a0" description = "dlt is an open-source python-first scalable data loading library that 
does not require any backend to run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] @@ -152,6 +152,7 @@ types-pytz = ">=2024.1.0.20240203" ruff = "^0.3.2" pyjwt = "^2.8.0" pytest-mock = "^3.14.0" +types-regex = "^2024.5.15.20240519" [tool.poetry.group.pipeline] optional = true diff --git a/tests/common/normalizers/snake_no_x.py b/tests/common/cases/normalizers/snake_no_x.py similarity index 100% rename from tests/common/normalizers/snake_no_x.py rename to tests/common/cases/normalizers/snake_no_x.py diff --git a/tests/common/cases/normalizers/sql_upper.py b/tests/common/cases/normalizers/sql_upper.py index 992940d9a2..f2175f06ad 100644 --- a/tests/common/cases/normalizers/sql_upper.py +++ b/tests/common/cases/normalizers/sql_upper.py @@ -8,13 +8,11 @@ class NamingConvention(BaseNamingConvention): _CLEANUP_TABLE = str.maketrans(".\n\r'\"▶", "______") + @property + def is_case_sensitive(self) -> bool: + return True + def normalize_identifier(self, identifier: str) -> str: identifier = super().normalize_identifier(identifier) norm_identifier = identifier.translate(self._CLEANUP_TABLE).upper() return self.shorten_identifier(norm_identifier, identifier, self.max_length) - - def make_path(self, *identifiers: Any) -> str: - return self.PATH_SEPARATOR.join(filter(lambda x: x.strip(), identifiers)) - - def break_path(self, path: str) -> Sequence[str]: - return [ident for ident in path.split(self.PATH_SEPARATOR) if ident.strip()] diff --git a/tests/common/cases/normalizers/title_case.py b/tests/common/cases/normalizers/title_case.py index 41eb96fcda..2b93b476c8 100644 --- a/tests/common/cases/normalizers/title_case.py +++ b/tests/common/cases/normalizers/title_case.py @@ -1,10 +1,11 @@ +from typing import ClassVar from dlt.common.normalizers.naming.direct import NamingConvention as DirectNamingConvention class NamingConvention(DirectNamingConvention): """Test case sensitive naming that capitalizes first and last letter and leaves the rest intact""" - PATH_SEPARATOR = "__" + PATH_SEPARATOR: ClassVar[str] = "__" def normalize_identifier(self, identifier: str) -> str: # keep prefix diff --git a/tests/common/normalizers/custom_normalizers.py b/tests/common/normalizers/custom_normalizers.py index 3ae65c8b53..4a0f456eef 100644 --- a/tests/common/normalizers/custom_normalizers.py +++ b/tests/common/normalizers/custom_normalizers.py @@ -11,6 +11,13 @@ def normalize_identifier(self, identifier: str) -> str: return "column_" + identifier.lower() +class ColumnNamingConvention(SnakeCaseNamingConvention): + def normalize_identifier(self, identifier: str) -> str: + if identifier.startswith("column_"): + return identifier + return "column_" + identifier.lower() + + class DataItemNormalizer(RelationalNormalizer): def extend_schema(self) -> None: json_config = self.schema._normalizers_config["json"]["config"] diff --git a/tests/common/normalizers/test_import_normalizers.py b/tests/common/normalizers/test_import_normalizers.py index 85bb8ca1cc..fe356de327 100644 --- a/tests/common/normalizers/test_import_normalizers.py +++ b/tests/common/normalizers/test_import_normalizers.py @@ -1,13 +1,23 @@ import os - import pytest from dlt.common.configuration.container import Container from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.normalizers.utils import explicit_normalizers, import_normalizers +from dlt.common.normalizers.typing import TNormalizersConfig +from dlt.common.normalizers.utils import ( + 
DEFAULT_NAMING_NAMESPACE, + explicit_normalizers, + import_normalizers, + naming_from_reference, + serialize_reference, +) from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer from dlt.common.normalizers.naming import snake_case, direct -from dlt.common.normalizers.naming.exceptions import InvalidNamingModule, UnknownNamingModule +from dlt.common.normalizers.naming.exceptions import ( + InvalidNamingType, + NamingTypeNotFound, + UnknownNamingModule, +) from tests.common.normalizers.custom_normalizers import ( DataItemNormalizer as CustomRelationalNormalizer, @@ -15,7 +25,7 @@ from tests.utils import preserve_environ -def test_default_normalizers() -> None: +def test_explicit_normalizers() -> None: config = explicit_normalizers() assert config["names"] is None assert config["json"] is None @@ -25,6 +35,12 @@ def test_default_normalizers() -> None: assert config["names"] == "direct" assert config["json"] == {"module": "custom"} + # pass modules and types, make sure normalizer config is serialized + config = explicit_normalizers(direct) + assert config["names"] == f"{DEFAULT_NAMING_NAMESPACE}.direct.NamingConvention" + config = explicit_normalizers(direct.NamingConvention) + assert config["names"] == f"{DEFAULT_NAMING_NAMESPACE}.direct.NamingConvention" + # use environ os.environ["SCHEMA__NAMING"] = "direct" os.environ["SCHEMA__JSON_NORMALIZER"] = '{"module": "custom"}' @@ -33,13 +49,75 @@ def test_default_normalizers() -> None: assert config["json"] == {"module": "custom"} -def test_default_normalizers_with_caps() -> None: +def test_explicit_normalizers_caps_ignored() -> None: # gets the naming convention from capabilities destination_caps = DestinationCapabilitiesContext.generic_capabilities() destination_caps.naming_convention = "direct" with Container().injectable_context(destination_caps): config = explicit_normalizers() - assert config["names"] == "direct" + assert config["names"] is None + + +def test_serialize_reference() -> None: + assert serialize_reference(None) is None + assert serialize_reference("module") == "module" + assert ( + serialize_reference(snake_case) == f"{DEFAULT_NAMING_NAMESPACE}.snake_case.NamingConvention" + ) + assert ( + serialize_reference(snake_case.NamingConvention) + == f"{DEFAULT_NAMING_NAMESPACE}.snake_case.NamingConvention" + ) + # test a wrong module and type + with pytest.raises(NamingTypeNotFound): + serialize_reference(pytest) + with pytest.raises(ValueError): + serialize_reference(Container) # type: ignore[arg-type] + + +def test_naming_from_reference() -> None: + assert naming_from_reference("snake_case").name() == "snake_case" + assert naming_from_reference("snake_case.NamingConvention").name() == "snake_case" + + # now not visible + with pytest.raises(UnknownNamingModule): + naming_from_reference("custom_normalizers") + + # temporarily add current file dir to paths and import module that clash with dlt predefined (no path) + import sys + + try: + sys.path.insert(0, os.path.dirname(__file__)) + assert naming_from_reference("custom_normalizers").name() == "custom_normalizers" + assert ( + naming_from_reference("custom_normalizers.NamingConvention").name() + == "custom_normalizers" + ) + assert ( + naming_from_reference("custom_normalizers.ColumnNamingConvention").name() + == "custom_normalizers" + ) + finally: + sys.path.pop(0) + + # non standard location + assert ( + naming_from_reference("dlt.destinations.impl.weaviate.naming").name() + == "dlt.destinations.impl.weaviate.naming" + ) + + # import module + 
assert naming_from_reference(snake_case).name() == "snake_case" + assert naming_from_reference(snake_case.NamingConvention).name() == "snake_case" + + with pytest.raises(ValueError): + naming_from_reference(snake_case.NamingConvention()) # type: ignore[arg-type] + + # with capabilities + caps = DestinationCapabilitiesContext.generic_capabilities() + caps.max_identifier_length = 120 + naming = naming_from_reference(snake_case.NamingConvention, caps) + assert naming.max_length == 120 def test_import_normalizers() -> None: @@ -63,6 +141,29 @@ def test_import_normalizers() -> None: assert json_normalizer is CustomRelationalNormalizer +def test_import_normalizers_with_defaults() -> None: + explicit = explicit_normalizers() + default_: TNormalizersConfig = { + "names": "dlt.destinations.impl.weaviate.naming", + "json": {"module": "tests.common.normalizers.custom_normalizers"}, + } + config, naming, json_normalizer = import_normalizers(explicit, default_) + + assert config["names"] == "dlt.destinations.impl.weaviate.naming" + assert config["json"] == {"module": "tests.common.normalizers.custom_normalizers"} + assert naming.name() == "dlt.destinations.impl.weaviate.naming" + assert json_normalizer is CustomRelationalNormalizer + + # correctly overrides + explicit["names"] = "sql_cs_v1" + explicit["json"] = {"module": "dlt.common.normalizers.json.relational"} + config, naming, json_normalizer = import_normalizers(explicit, default_) + assert config["names"] == "sql_cs_v1" + assert config["json"] == {"module": "dlt.common.normalizers.json.relational"} + assert naming.name() == "sql_cs_v1" + assert json_normalizer is RelationalNormalizer + + @pytest.mark.parametrize("sections", ("", "SOURCES__", "SOURCES__TEST_SCHEMA__")) def test_config_sections(sections: str) -> None: os.environ[f"{sections}SCHEMA__NAMING"] = "direct" @@ -84,6 +185,25 @@ def test_import_normalizers_with_caps() -> None: assert isinstance(naming, direct.NamingConvention) assert naming.max_length == 127 + _, naming, _ = import_normalizers(explicit_normalizers(snake_case)) + assert isinstance(naming, snake_case.NamingConvention) + assert naming.max_length == 127 + + # max table nesting generates relational normalizer + default_: TNormalizersConfig = { + "names": "dlt.destinations.impl.weaviate.naming", + "json": {"module": "tests.common.normalizers.custom_normalizers"}, + } + destination_caps.max_table_nesting = 0 + with Container().injectable_context(destination_caps): + config, _, relational = import_normalizers(explicit_normalizers()) + assert config["json"]["config"]["max_nesting"] == 0 + assert relational is RelationalNormalizer + + # wrong normalizer + config, _, relational = import_normalizers(explicit_normalizers(), default_) + assert "config" not in config["json"] + def test_import_invalid_naming_module() -> None: with pytest.raises(UnknownNamingModule) as py_ex: @@ -92,7 +212,7 @@ def test_import_invalid_naming_module() -> None: with pytest.raises(UnknownNamingModule) as py_ex: import_normalizers(explicit_normalizers("dlt.common.tests")) assert py_ex.value.naming_module == "dlt.common.tests" - with pytest.raises(InvalidNamingModule) as py_ex2: + with pytest.raises(InvalidNamingType) as py_ex2: import_normalizers(explicit_normalizers("dlt.pipeline.helpers")) assert py_ex2.value.naming_module == "dlt.pipeline" assert py_ex2.value.naming_class == "helpers" diff --git a/tests/common/normalizers/test_naming.py b/tests/common/normalizers/test_naming.py index 27325ab3cc..84d36537e6 100644 --- 
a/tests/common/normalizers/test_naming.py +++ b/tests/common/normalizers/test_naming.py @@ -2,13 +2,29 @@ import string from typing import List, Type -from dlt.common.normalizers.naming import NamingConvention -from dlt.common.normalizers.naming.snake_case import NamingConvention as SnakeCaseNamingConvention -from dlt.common.normalizers.naming.direct import NamingConvention as DirectNamingConvention +from dlt.common.normalizers.naming import ( + NamingConvention, + snake_case, + direct, + duck_case, + sql_ci_v1, + sql_cs_v1, +) from dlt.common.typing import DictStrStr from dlt.common.utils import uniq_id +ALL_NAMING_CONVENTIONS = { + snake_case.NamingConvention, + direct.NamingConvention, + duck_case.NamingConvention, + sql_ci_v1.NamingConvention, + sql_cs_v1.NamingConvention, +} + +ALL_UNDERSCORE_PATH_CONVENTIONS = ALL_NAMING_CONVENTIONS - {direct.NamingConvention} + + LONG_PATH = "prospects_external_data__data365_member__member__feed_activities_created_post__items__comments__items__comments__items__author_details__educations" DENSE_PATH = "__".join(string.ascii_lowercase) LONG_IDENT = 10 * string.printable @@ -139,7 +155,7 @@ def test_shorten_identifier() -> None: assert len(norm_ident) == 20 -@pytest.mark.parametrize("convention", (SnakeCaseNamingConvention, DirectNamingConvention)) +@pytest.mark.parametrize("convention", ALL_NAMING_CONVENTIONS) def test_normalize_with_shorten_identifier(convention: Type[NamingConvention]) -> None: naming = convention() # None/empty ident raises @@ -164,7 +180,7 @@ def test_normalize_with_shorten_identifier(convention: Type[NamingConvention]) - assert tag in naming.normalize_identifier(RAW_IDENT) -@pytest.mark.parametrize("convention", (SnakeCaseNamingConvention, DirectNamingConvention)) +@pytest.mark.parametrize("convention", ALL_NAMING_CONVENTIONS) def test_normalize_path_shorting(convention: Type[NamingConvention]) -> None: naming = convention() path = naming.make_path(*LONG_PATH.split("__")) @@ -207,10 +223,11 @@ def test_normalize_path_shorting(convention: Type[NamingConvention]) -> None: assert len(naming.break_path(norm_path)) == 1 -@pytest.mark.parametrize("convention", (SnakeCaseNamingConvention, DirectNamingConvention)) +@pytest.mark.parametrize("convention", ALL_NAMING_CONVENTIONS) def test_normalize_path(convention: Type[NamingConvention]) -> None: naming = convention() raw_path_str = naming.make_path(*RAW_PATH) + assert convention.PATH_SEPARATOR in raw_path_str # count separators norm_path_str = naming.normalize_path(raw_path_str) assert len(naming.break_path(norm_path_str)) == len(RAW_PATH) @@ -248,7 +265,7 @@ def test_normalize_path(convention: Type[NamingConvention]) -> None: assert tag in tagged_raw_path_str -@pytest.mark.parametrize("convention", (SnakeCaseNamingConvention, DirectNamingConvention)) +@pytest.mark.parametrize("convention", ALL_NAMING_CONVENTIONS) def test_shorten_fragments(convention: Type[NamingConvention]) -> None: # max length around the length of the path naming = convention() @@ -266,9 +283,30 @@ def test_shorten_fragments(convention: Type[NamingConvention]) -> None: assert naming.shorten_fragments(*RAW_PATH_WITH_EMPTY_IDENT) == norm_path +@pytest.mark.parametrize("convention", ALL_UNDERSCORE_PATH_CONVENTIONS) +def test_normalize_break_path(convention: Type[NamingConvention]) -> None: + naming_unlimited = convention() + assert naming_unlimited.break_path("A__B__C") == ["A", "B", "C"] + # what if path has _a and _b which valid normalized idents + assert naming_unlimited.break_path("_a___b__C___D") == ["_a", "_b", 
"C", "_D"] + # skip empty identifiers from path + assert naming_unlimited.break_path("_a_____b") == ["_a", "_b"] + assert naming_unlimited.break_path("_a____b") == ["_a", "b"] + assert naming_unlimited.break_path("_a__ \t\r__b") == ["_a", "b"] + + +@pytest.mark.parametrize("convention", ALL_UNDERSCORE_PATH_CONVENTIONS) +def test_normalize_make_path(convention: Type[NamingConvention]) -> None: + naming_unlimited = convention() + assert naming_unlimited.make_path("A", "B") == "A__B" + assert naming_unlimited.make_path("_A", "_B") == "_A___B" + assert naming_unlimited.make_path("_A", "", "_B") == "_A___B" + assert naming_unlimited.make_path("_A", "\t\n ", "_B") == "_A___B" + + def test_naming_convention_name() -> None: - assert SnakeCaseNamingConvention.name() == "snake_case" - assert DirectNamingConvention.name() == "direct" + assert snake_case.NamingConvention.name() == "snake_case" + assert direct.NamingConvention.name() == "direct" def assert_short_path(norm_path: str, naming: NamingConvention) -> None: diff --git a/tests/common/normalizers/test_naming_snake_case.py b/tests/common/normalizers/test_naming_snake_case.py index 6d619b5257..ee4f43e7f0 100644 --- a/tests/common/normalizers/test_naming_snake_case.py +++ b/tests/common/normalizers/test_naming_snake_case.py @@ -1,9 +1,7 @@ -from typing import Type import pytest from dlt.common.normalizers.naming import NamingConvention from dlt.common.normalizers.naming.snake_case import NamingConvention as SnakeCaseNamingConvention -from dlt.common.normalizers.naming.duck_case import NamingConvention as DuckCaseNamingConvention @pytest.fixture @@ -54,30 +52,9 @@ def test_normalize_path(naming_unlimited: NamingConvention) -> None: def test_normalize_non_alpha_single_underscore() -> None: - assert SnakeCaseNamingConvention._RE_NON_ALPHANUMERIC.sub("_", "-=!*") == "_" - assert SnakeCaseNamingConvention._RE_NON_ALPHANUMERIC.sub("_", "1-=!0*-") == "1_0_" - assert SnakeCaseNamingConvention._RE_NON_ALPHANUMERIC.sub("_", "1-=!_0*-") == "1__0_" - - -@pytest.mark.parametrize("convention", (SnakeCaseNamingConvention, DuckCaseNamingConvention)) -def test_normalize_break_path(convention: Type[NamingConvention]) -> None: - naming_unlimited = convention() - assert naming_unlimited.break_path("A__B__C") == ["A", "B", "C"] - # what if path has _a and _b which valid normalized idents - assert naming_unlimited.break_path("_a___b__C___D") == ["_a", "_b", "C", "_D"] - # skip empty identifiers from path - assert naming_unlimited.break_path("_a_____b") == ["_a", "_b"] - assert naming_unlimited.break_path("_a____b") == ["_a", "b"] - assert naming_unlimited.break_path("_a__ \t\r__b") == ["_a", "b"] - - -@pytest.mark.parametrize("convention", (SnakeCaseNamingConvention, DuckCaseNamingConvention)) -def test_normalize_make_path(convention: Type[NamingConvention]) -> None: - naming_unlimited = convention() - assert naming_unlimited.make_path("A", "B") == "A__B" - assert naming_unlimited.make_path("_A", "_B") == "_A___B" - assert naming_unlimited.make_path("_A", "", "_B") == "_A___B" - assert naming_unlimited.make_path("_A", "\t\n ", "_B") == "_A___B" + assert SnakeCaseNamingConvention.RE_NON_ALPHANUMERIC.sub("_", "-=!*") == "_" + assert SnakeCaseNamingConvention.RE_NON_ALPHANUMERIC.sub("_", "1-=!0*-") == "1_0_" + assert SnakeCaseNamingConvention.RE_NON_ALPHANUMERIC.sub("_", "1-=!_0*-") == "1__0_" def test_normalizes_underscores(naming_unlimited: NamingConvention) -> None: diff --git a/tests/common/normalizers/test_naming_sql.py b/tests/common/normalizers/test_naming_sql.py 
new file mode 100644 index 0000000000..c290354c6a --- /dev/null +++ b/tests/common/normalizers/test_naming_sql.py @@ -0,0 +1,55 @@ +import pytest +from typing import Type +from dlt.common.normalizers.naming import NamingConvention, sql_ci_v1, sql_cs_v1 + +ALL_NAMING_CONVENTIONS = {sql_ci_v1.NamingConvention, sql_cs_v1.NamingConvention} + + +@pytest.mark.parametrize("convention", ALL_NAMING_CONVENTIONS) +def test_normalize_identifier(convention: Type[NamingConvention]) -> None: + naming = convention() + assert naming.normalize_identifier("event_value") == "event_value" + assert naming.normalize_identifier("event value") == "event_value" + assert naming.normalize_identifier("event-.!:*<>value") == "event_value" + # prefix leading digits + assert naming.normalize_identifier("1event_n'") == "_1event_n" + # remove trailing underscores + assert naming.normalize_identifier("123event_n'") == "_123event_n" + # contract underscores + assert naming.normalize_identifier("___a___b") == "_a_b" + # trim spaces + assert naming.normalize_identifier(" small love potion ") == "small_love_potion" + + # special characters converted to _ + assert naming.normalize_identifier("+-!$*@#=|:") == "_" + # leave single underscore + assert naming.normalize_identifier("_") == "_" + # some other cases + assert naming.normalize_identifier("+1") == "_1" + assert naming.normalize_identifier("-1") == "_1" + + +def test_case_sensitive_normalize() -> None: + naming = sql_cs_v1.NamingConvention() + # all lowercase and converted to snake + assert naming.normalize_identifier("123BaNaNa") == "_123BaNaNa" + # consecutive capital letters + assert naming.normalize_identifier("BANANA") == "BANANA" + assert naming.normalize_identifier("BAN_ANA") == "BAN_ANA" + assert naming.normalize_identifier("BANaNA") == "BANaNA" + # handling spaces + assert naming.normalize_identifier("Small Love Potion") == "Small_Love_Potion" + assert naming.normalize_identifier(" Small Love Potion ") == "Small_Love_Potion" + + +def test_case_insensitive_normalize() -> None: + naming = sql_ci_v1.NamingConvention() + # all lowercase and converted to snake + assert naming.normalize_identifier("123BaNaNa") == "_123banana" + # consecutive capital letters + assert naming.normalize_identifier("BANANA") == "banana" + assert naming.normalize_identifier("BAN_ANA") == "ban_ana" + assert naming.normalize_identifier("BANaNA") == "banana" + # handling spaces + assert naming.normalize_identifier("Small Love Potion") == "small_love_potion" + assert naming.normalize_identifier(" Small Love Potion ") == "small_love_potion" diff --git a/tests/common/schema/test_normalize_identifiers.py b/tests/common/schema/test_normalize_identifiers.py index b71977a5fd..60f8c04604 100644 --- a/tests/common/schema/test_normalize_identifiers.py +++ b/tests/common/schema/test_normalize_identifiers.py @@ -280,25 +280,26 @@ def test_normalize_default_hints(schema_storage_no_import: SchemaStorage) -> Non schema_storage_no_import.save_schema(orig_schema) with Container().injectable_context( - DestinationCapabilitiesContext.generic_capabilities( - naming_convention=sql_upper.NamingConvention() - ) + DestinationCapabilitiesContext.generic_capabilities(naming_convention=sql_upper) ) as caps: - assert isinstance(caps.naming_convention, sql_upper.NamingConvention) + assert caps.naming_convention is sql_upper # creating a schema from dict keeps original normalizers schema = Schema.from_dict(eth_V9) assert_schema_identifiers_case(schema, str.lower) - assert 
schema._normalizers_config["names"].endswith("snake_case") # type: ignore + assert schema._normalizers_config["names"].endswith("snake_case") # loading from storage keeps storage normalizers storage_schema = schema_storage_no_import.load_schema("ethereum") assert_schema_identifiers_case(storage_schema, str.lower) - assert storage_schema._normalizers_config["names"].endswith("snake_case") # type: ignore + assert storage_schema._normalizers_config["names"].endswith("snake_case") # new schema instance is created using caps/config new_schema = Schema("new") assert_schema_identifiers_case(new_schema, str.upper) - assert isinstance(new_schema._normalizers_config["names"], NamingConvention) + assert ( + new_schema._normalizers_config["names"] + == "tests.common.cases.normalizers.sql_upper.NamingConvention" + ) # attempt to update normalizers blocked by tables with data with pytest.raises(TableIdentifiersFrozen): @@ -310,14 +311,20 @@ def test_normalize_default_hints(schema_storage_no_import: SchemaStorage) -> Non # remove processing hints and normalize norm_cloned = schema.clone(update_normalizers=True, remove_processing_hints=True) assert_schema_identifiers_case(norm_cloned, str.upper) - assert isinstance(norm_cloned._normalizers_config["names"], NamingConvention) + assert ( + norm_cloned._normalizers_config["names"] + == "tests.common.cases.normalizers.sql_upper.NamingConvention" + ) norm_schema = Schema.from_dict( deepcopy(eth_V9), remove_processing_hints=True, bump_version=False ) norm_schema.update_normalizers() assert_schema_identifiers_case(norm_schema, str.upper) - assert isinstance(norm_schema._normalizers_config["names"], NamingConvention) + assert ( + norm_schema._normalizers_config["names"] + == "tests.common.cases.normalizers.sql_upper.NamingConvention" + ) # both ways of obtaining schemas (cloning, cleaning dict) must generate identical schemas assert norm_cloned.to_pretty_json() == norm_schema.to_pretty_json() @@ -329,7 +336,7 @@ def test_normalize_default_hints(schema_storage_no_import: SchemaStorage) -> Non storage_schema = schema_storage_no_import.load_schema("ethereum") assert_schema_identifiers_case(storage_schema, str.upper) # the instance got converted into - assert storage_schema._normalizers_config["names"].endswith("sql_upper.NamingConvention") # type: ignore + assert storage_schema._normalizers_config["names"].endswith("sql_upper.NamingConvention") assert storage_schema.stored_version_hash == storage_schema.version_hash # cloned when bumped must have same version hash norm_cloned._bump_version() @@ -355,7 +362,7 @@ def test_raise_on_change_identifier_table_with_data() -> None: # use special naming convention that only changes column names ending with x to _ issues_table["columns"]["columnx"] = {"name": "columnx", "data_type": "bigint"} assert schema.tables["issues"] is issues_table - os.environ["SCHEMA__NAMING"] = "tests.common.normalizers.snake_no_x" + os.environ["SCHEMA__NAMING"] = "tests.common.cases.normalizers.snake_no_x" with pytest.raises(TableIdentifiersFrozen) as fr_ex: schema.update_normalizers() assert fr_ex.value.table_name == "issues" From a23435f5c620ae826c70557f6d00b3bd0581ad19 Mon Sep 17 00:00:00 2001 From: Pablo Castellano Date: Mon, 1 Jul 2024 11:18:05 +0200 Subject: [PATCH 43/61] docs: Fixed markdown issue in duckdb.md (#1528) --- docs/website/docs/dlt-ecosystem/destinations/duckdb.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md 
b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index 1e3d6b8403..023f3e35bc 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -37,7 +37,7 @@ All write dispositions are supported. ### Names normalization `dlt` uses the standard **snake_case** naming convention to keep identical table and column identifiers across all destinations. If you want to use the **duckdb** wide range of characters (i.e., emojis) for table and column names, you can switch to the **duck_case** naming convention, which accepts almost any string as an identifier: -* `\n` `\r` and `" are translated to `_` +* `\n` `\r` and `"` are translated to `_` * multiple `_` are translated to a single `_` Switch the naming convention using `config.toml`: From 653673cc43e4a622f44005d72f570439cb9fb249 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Tue, 2 Jul 2024 10:50:34 +0400 Subject: [PATCH 44/61] remove obsolete dremio destination capabilities (#1527) --- dlt/destinations/impl/dremio/__init__.py | 30 ------------------------ 1 file changed, 30 deletions(-) diff --git a/dlt/destinations/impl/dremio/__init__.py b/dlt/destinations/impl/dremio/__init__.py index 96d4748f1d..e69de29bb2 100644 --- a/dlt/destinations/impl/dremio/__init__.py +++ b/dlt/destinations/impl/dremio/__init__.py @@ -1,30 +0,0 @@ -from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.common.data_writers.escape import escape_dremio_identifier -from dlt.common.destination import DestinationCapabilitiesContext - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = None - caps.supported_loader_file_formats = [] - caps.preferred_staging_file_format = "parquet" - caps.supported_staging_file_formats = ["jsonl", "parquet"] - caps.escape_identifier = escape_dremio_identifier - # all identifiers are case insensitive but are stored as is - # https://docs.dremio.com/current/sonar/data-sources - caps.has_case_sensitive_identifiers = False - caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) - caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) - caps.max_identifier_length = 255 - caps.max_column_identifier_length = 255 - caps.max_query_length = 2 * 1024 * 1024 - caps.is_max_query_length_in_bytes = True - caps.max_text_data_type_length = 16 * 1024 * 1024 - caps.is_max_text_data_type_length_in_bytes = True - caps.supports_transactions = False - caps.supports_ddl_transactions = False - caps.alter_add_multi_column = True - caps.supports_clone_table = False - caps.supports_multiple_statements = False - caps.timestamp_precision = 3 - return caps From b6d08aa21672f50c0b97ff5d408a2ac09282b929 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Tue, 2 Jul 2024 11:41:22 +0200 Subject: [PATCH 45/61] rest_api: add a quick example to rest_api docs (#1531) --- .../verified-sources/rest_api.md | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md index 11d09c89f7..96cbe3b87d 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api.md @@ -9,6 +9,61 @@ import Header from './_source-info-header.md'; This is a generic dlt source you can use to extract data from any REST API. 
It uses [declarative configuration](#source-configuration) to define the API endpoints, their [relationships](#define-resource-relationships), how to handle [pagination](#pagination), and [authentication](#authentication). +### Quick example + +Here's an example of how to configure the REST API source to load posts and related comments from a hypothetical blog API: + +```py +import dlt +from rest_api import rest_api_source + +source = rest_api_source({ + "client": { + "base_url": "https://api.example.com/", + "auth": { + "token": dlt.secrets["your_api_token"], + }, + "paginator": { + "type": "json_response", + "next_url_path": "paging.next", + }, + }, + "resources": [ + # "posts" will be used as the endpoint path, the resource name, + # and the table name in the destination. The HTTP client will send + # a request to "https://api.example.com/posts". + "posts", + + # The explicit configuration allows you to link resources + # and define parameters. + { + "name": "comments", + "endpoint": { + "path": "posts/{post_id}/comments", + "params": { + "post_id": { + "type": "resolve", + "resource": "posts", + "field": "id", + }, + "sort": "created_at", + }, + }, + }, + ], +}) + +pipeline = dlt.pipeline( + pipeline_name="rest_api_example", + destination="duckdb", + dataset_name="rest_api_data", +) + +load_info = pipeline.run(source) +``` + +Running this pipeline will create two tables in the DuckDB: `posts` and `comments` with the data from the respective API endpoints. The `comments` resource will fetch comments for each post by using the `id` field from the `posts` resource. + ## Setup guide ### Initialize the verified source From 31c9995fcc3bce57466503d69d2b9b544e6ccf47 Mon Sep 17 00:00:00 2001 From: Axell <68310020+axellpadilla@users.noreply.github.com> Date: Wed, 3 Jul 2024 02:48:04 -0600 Subject: [PATCH 46/61] Update grouping-resources.md docs (#1538) --- docs/website/docs/tutorial/grouping-resources.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/tutorial/grouping-resources.md b/docs/website/docs/tutorial/grouping-resources.md index 3ba95b7971..49bb5c3e8d 100644 --- a/docs/website/docs/tutorial/grouping-resources.md +++ b/docs/website/docs/tutorial/grouping-resources.md @@ -106,7 +106,7 @@ You've noticed that there's a lot of code duplication in the `get_issues` and `g ```py import dlt -from dlt.sources.helpers import requests +from dlt.sources.helpers import paginate BASE_GITHUB_URL = "https://api.github.com/repos/dlt-hub/dlt" @@ -231,7 +231,7 @@ The next step is to make our dlt GitHub source reusable so it can load data from ```py import dlt -from dlt.sources.helpers import requests +from dlt.sources.helpers import paginate BASE_GITHUB_URL = "https://api.github.com/repos/{repo_name}" From b5e9c9aa934b97ace9dae45b7f4ff0e608307bab Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Thu, 4 Jul 2024 13:46:06 +0200 Subject: [PATCH 47/61] fixes rest_client imports in grouping-resources snippets --- docs/website/docs/tutorial/grouping-resources.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/tutorial/grouping-resources.md b/docs/website/docs/tutorial/grouping-resources.md index 49bb5c3e8d..2bbfd231f2 100644 --- a/docs/website/docs/tutorial/grouping-resources.md +++ b/docs/website/docs/tutorial/grouping-resources.md @@ -106,7 +106,7 @@ You've noticed that there's a lot of code duplication in the `get_issues` and `g ```py import dlt -from dlt.sources.helpers import paginate +from dlt.sources.helpers.rest_client 
import paginate BASE_GITHUB_URL = "https://api.github.com/repos/dlt-hub/dlt" @@ -231,7 +231,7 @@ The next step is to make our dlt GitHub source reusable so it can load data from ```py import dlt -from dlt.sources.helpers import paginate +from dlt.sources.helpers.rest_client import paginate BASE_GITHUB_URL = "https://api.github.com/repos/{repo_name}" From eb834067965c52182ec96fe100dc1224243a00c4 Mon Sep 17 00:00:00 2001 From: Ilya Gurov Date: Thu, 4 Jul 2024 17:08:59 +0400 Subject: [PATCH 48/61] feat(filesystem): use only netloc and scheme for fingerprint (#1516) --- dlt/common/storages/configuration.py | 15 +++++++++++++-- tests/load/filesystem/test_filesystem_client.py | 16 +++++++++++----- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/dlt/common/storages/configuration.py b/dlt/common/storages/configuration.py index 09beb0015e..b2bdb3a7b6 100644 --- a/dlt/common/storages/configuration.py +++ b/dlt/common/storages/configuration.py @@ -111,8 +111,19 @@ def resolve_credentials_type(self) -> Type[CredentialsConfiguration]: return self.PROTOCOL_CREDENTIALS.get(self.protocol) or Optional[CredentialsConfiguration] # type: ignore[return-value] def fingerprint(self) -> str: - """Returns a fingerprint of bucket_url""" - return digest128(self.bucket_url) if self.bucket_url else "" + """Returns a fingerprint of bucket schema and netloc. + + Returns: + str: Fingerprint. + """ + if not self.bucket_url: + return "" + + if self.is_local_path(self.bucket_url): + return digest128("") + + uri = urlparse(self.bucket_url) + return digest128(self.bucket_url.replace(uri.path, "")) def __str__(self) -> str: """Return displayable destination location""" diff --git a/tests/load/filesystem/test_filesystem_client.py b/tests/load/filesystem/test_filesystem_client.py index 597d400344..f16e75c7e6 100644 --- a/tests/load/filesystem/test_filesystem_client.py +++ b/tests/load/filesystem/test_filesystem_client.py @@ -48,11 +48,17 @@ def logger_autouse() -> None: ] -def test_filesystem_destination_configuration() -> None: - assert FilesystemDestinationClientConfiguration().fingerprint() == "" - assert FilesystemDestinationClientConfiguration( - bucket_url="s3://cool" - ).fingerprint() == digest128("s3://cool") +@pytest.mark.parametrize( + "url, exp", + ( + (None, ""), + ("/path/path2", digest128("")), + ("s3://cool", digest128("s3://cool")), + ("s3://cool.domain/path/path2", digest128("s3://cool.domain")), + ), +) +def test_filesystem_destination_configuration(url, exp) -> None: + assert FilesystemDestinationClientConfiguration(bucket_url=url).fingerprint() == exp def test_filesystem_factory_buckets(with_gdrive_buckets_env: str) -> None: From 6aedfdd379f7457c391384b0422c8545d18218f3 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Thu, 4 Jul 2024 15:52:48 +0200 Subject: [PATCH 49/61] removes deprecated credentials argument from Pipeline (#1537) * removes deprected credentials argument from Pipeline * fixes dependency in tests * fixes explicit creds tests dependencies --- dlt/common/destination/reference.py | 2 +- dlt/pipeline/__init__.py | 17 +---- dlt/pipeline/pipeline.py | 74 +++++++------------ dlt/pipeline/warnings.py | 17 ----- docs/examples/archive/credentials/explicit.py | 5 +- docs/examples/archive/quickstart.py | 5 +- .../docs/dlt-ecosystem/destinations/duckdb.md | 3 +- tests/extract/test_incremental.py | 9 ++- .../airflow_tests/test_airflow_wrapper.py | 59 ++++++--------- .../test_join_airflow_scheduler.py | 3 +- tests/load/duckdb/test_duckdb_client.py | 14 ++-- 
tests/load/filesystem/test_aws_credentials.py | 18 +++++ tests/load/filesystem/test_gcs_credentials.py | 32 ++++++++ tests/load/pipeline/test_pipelines.py | 60 ++++++++++----- tests/pipeline/test_pipeline.py | 54 +++----------- tests/pipeline/test_schema_contracts.py | 3 +- 16 files changed, 177 insertions(+), 198 deletions(-) create mode 100644 tests/load/filesystem/test_gcs_credentials.py diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 90f89b85d7..a735aad5cf 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -618,7 +618,7 @@ def normalize_type(destination_type: str) -> str: @staticmethod def from_reference( ref: TDestinationReferenceArg, - credentials: Optional[CredentialsConfiguration] = None, + credentials: Optional[Any] = None, destination_name: Optional[str] = None, environment: Optional[str] = None, **kwargs: Any, diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index 4efc7716e6..8041ca72e0 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -14,7 +14,7 @@ from dlt.pipeline.configuration import PipelineConfiguration, ensure_correct_pipeline_kwargs from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.progress import _from_name as collector_from_name, TCollectorArg, _NULL_COLLECTOR -from dlt.pipeline.warnings import credentials_argument_deprecated, full_refresh_argument_deprecated +from dlt.pipeline.warnings import full_refresh_argument_deprecated TPipeline = TypeVar("TPipeline", bound=Pipeline, default=Pipeline) @@ -32,7 +32,6 @@ def pipeline( full_refresh: Optional[bool] = None, dev_mode: bool = False, refresh: Optional[TRefreshMode] = None, - credentials: Any = None, progress: TCollectorArg = _NULL_COLLECTOR, _impl_cls: Type[TPipeline] = Pipeline, # type: ignore[assignment] ) -> TPipeline: @@ -78,9 +77,6 @@ def pipeline( * `drop_resources`: Drop tables and resource state for all resources being processed. Source level state is not modified. (Note: schema history is erased) * `drop_data`: Wipe all data and resource state for all resources being processed. Schema is not modified. - credentials (Any, optional): Credentials for the `destination` ie. database connection string or a dictionary with google cloud credentials. - In most cases should be set to None, which lets `dlt` to use `secrets.toml` or environment variables to infer right credentials values. - progress(str, Collector): A progress monitor that shows progress bars, console or log messages with current information on sources, resources, data items etc. processed in `extract`, `normalize` and `load` stage. Pass a string with a collector name or configure your own by choosing from `dlt.progress` module. We support most of the progress libraries: try passing `tqdm`, `enlighten` or `alive_progress` or `log` to write to console/log. 
@@ -109,7 +105,6 @@ def pipeline( full_refresh: Optional[bool] = None, dev_mode: bool = False, refresh: Optional[TRefreshMode] = None, - credentials: Any = None, progress: TCollectorArg = _NULL_COLLECTOR, _impl_cls: Type[TPipeline] = Pipeline, # type: ignore[assignment] **injection_kwargs: Any, @@ -120,7 +115,6 @@ def pipeline( # is any of the arguments different from defaults has_arguments = bool(orig_args[0]) or any(orig_args[1].values()) - credentials_argument_deprecated("pipeline", credentials, destination) full_refresh_argument_deprecated("pipeline", full_refresh) if not has_arguments: @@ -153,7 +147,6 @@ def pipeline( destination, staging, dataset_name, - credentials, import_schema_path, export_schema_path, full_refresh if full_refresh is not None else dev_mode, @@ -204,7 +197,6 @@ def attach( None, None, None, - None, False, # always False as dev_mode so we do not wipe the working folder progress, True, @@ -222,7 +214,6 @@ def run( destination: TDestinationReferenceArg = None, staging: TDestinationReferenceArg = None, dataset_name: str = None, - credentials: Any = None, table_name: str = None, write_disposition: TWriteDispositionConfig = None, columns: Sequence[TColumnSchema] = None, @@ -257,9 +248,6 @@ def run( dataset_name (str, optional):A name of the dataset to which the data will be loaded. A dataset is a logical group of tables ie. `schema` in relational databases or folder grouping many files. If not provided, the value passed to `dlt.pipeline` will be used. If not provided at all then defaults to the `pipeline_name` - credentials (Any, optional): Credentials for the `destination` ie. database connection string or a dictionary with google cloud credentials. - In most cases should be set to None, which lets `dlt` to use `secrets.toml` or environment variables to infer right credentials values. - table_name (str, optional): The name of the table to which the data should be loaded within the `dataset`. This argument is required for a `data` that is a list/Iterable or Iterator without `__name__` attribute. The behavior of this argument depends on the type of the `data`: * generator functions: the function name is used as table name, `table_name` overrides this default @@ -280,13 +268,12 @@ def run( Returns: LoadInfo: Information on loaded data including the list of package ids and failed job statuses. Please not that `dlt` will not raise if a single job terminally fails. Such information is provided via LoadInfo. 
""" - destination = Destination.from_reference(destination, credentials=credentials) + destination = Destination.from_reference(destination) return pipeline().run( data, destination=destination, staging=staging, dataset_name=dataset_name, - credentials=credentials, table_name=table_name, write_disposition=write_disposition, columns=columns, diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 11f8d6223e..e4a7c7c4a8 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -23,14 +23,13 @@ from dlt.common.json import json from dlt.common.pendulum import pendulum from dlt.common.configuration import inject_section, known_sections -from dlt.common.configuration.specs import RunConfiguration, CredentialsConfiguration +from dlt.common.configuration.specs import RunConfiguration from dlt.common.configuration.container import Container from dlt.common.configuration.exceptions import ( ConfigFieldMissingException, ContextDefaultCannotBeCreated, ) from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.configuration.resolve import initialize_credentials from dlt.common.destination.exceptions import ( DestinationIncompatibleLoaderFileFormatException, DestinationNoStagingMode, @@ -145,7 +144,6 @@ state_resource, default_pipeline_state, ) -from dlt.pipeline.warnings import credentials_argument_deprecated from dlt.common.storages.load_package import TLoadPackageState from dlt.pipeline.helpers import refresh_source @@ -307,7 +305,6 @@ class Pipeline(SupportsPipeline): """The destination reference which is the Destination Class. `destination.destination_name` returns the name string""" dataset_name: str = None """Name of the dataset to which pipeline will be loaded to""" - credentials: Any = None is_active: bool = False """Tells if instance is currently active and available via dlt.pipeline()""" collector: _Collector @@ -323,7 +320,6 @@ def __init__( destination: TDestination, staging: TDestination, dataset_name: str, - credentials: Any, import_schema_path: str, export_schema_path: str, dev_mode: bool, @@ -357,7 +353,6 @@ def __init__( self._init_working_dir(pipeline_name, pipelines_dir) with self.managed_state() as state: - self.credentials = credentials self._configure(import_schema_path, export_schema_path, must_attach_to_local_pipeline) # changing the destination could be dangerous if pipeline has pending load packages self._set_destinations(destination=destination, staging=staging, initializing=True) @@ -382,7 +377,6 @@ def drop(self, pipeline_name: str = None) -> "Pipeline": self.destination, self.staging, self.dataset_name, - self.credentials, self._schema_storage.config.import_schema_path, self._schema_storage.config.export_schema_path, self.dev_mode, @@ -533,15 +527,13 @@ def load( workers: int = 20, raise_on_failed_jobs: bool = False, ) -> LoadInfo: - """Loads the packages prepared by `normalize` method into the `dataset_name` at `destination`, using provided `credentials`""" + """Loads the packages prepared by `normalize` method into the `dataset_name` at `destination`, optionally using provided `credentials`""" # set destination and default dataset if provided (this is the reason we have state sync here) - self._set_destinations(destination=destination, staging=None) + self._set_destinations( + destination=destination, destination_credentials=credentials, staging=None + ) self._set_dataset_name(dataset_name) - credentials_argument_deprecated("pipeline.load", credentials, destination) - - self.credentials = 
credentials or self.credentials - # check if any schema is present, if not then no data was extracted if not self.default_schema_name: return None @@ -623,7 +615,6 @@ def run( dataset_name (str, optional):A name of the dataset to which the data will be loaded. A dataset is a logical group of tables ie. `schema` in relational databases or folder grouping many files. If not provided, the value passed to `dlt.pipeline` will be used. If not provided at all then defaults to the `pipeline_name` - credentials (Any, optional): Credentials for the `destination` ie. database connection string or a dictionary with google cloud credentials. In most cases should be set to None, which lets `dlt` to use `secrets.toml` or environment variables to infer right credentials values. @@ -661,11 +652,11 @@ def run( signals.raise_if_signalled() self.activate() - self._set_destinations(destination=destination, staging=staging) + self._set_destinations( + destination=destination, destination_credentials=credentials, staging=staging + ) self._set_dataset_name(dataset_name) - credentials_argument_deprecated("pipeline.run", credentials, self.destination) - # sync state with destination if ( self.config.restore_from_destination @@ -926,7 +917,7 @@ def drop_pending_packages(self, with_partial_loads: bool = True) -> None: normalize_storage.extracted_packages.delete_package(load_id) @with_schemas_sync - def sync_schema(self, schema_name: str = None, credentials: Any = None) -> TSchemaTables: + def sync_schema(self, schema_name: str = None) -> TSchemaTables: """Synchronizes the schema `schema_name` with the destination. If no name is provided, the default schema will be synchronized.""" if not schema_name and not self.default_schema_name: raise PipelineConfigMissing( @@ -962,7 +953,7 @@ def get_local_state_val(self, key: str) -> Any: state = self._get_state() return state["_local"][key] # type: ignore - def sql_client(self, schema_name: str = None, credentials: Any = None) -> SqlClientBase[Any]: + def sql_client(self, schema_name: str = None) -> SqlClientBase[Any]: """Returns a sql client configured to query/change the destination and dataset that were used to load the data. Use the client with `with` statement to manage opening and closing connection to the destination: >>> with pipeline.sql_client() as client: @@ -972,7 +963,7 @@ def sql_client(self, schema_name: str = None, credentials: Any = None) -> SqlCli >>> print(cursor.fetchall()) The client is authenticated and defaults all queries to dataset_name used by the pipeline. You can provide alternative - `schema_name` which will be used to normalize dataset name and alternative `credentials`. + `schema_name` which will be used to normalize dataset name. """ # if not self.default_schema_name and not schema_name: # raise PipelineConfigMissing( @@ -982,9 +973,9 @@ def sql_client(self, schema_name: str = None, credentials: Any = None) -> SqlCli # "Sql Client is not available in a pipeline without a default schema. Extract some data first or restore the pipeline from the destination using 'restore_from_destination' flag. There's also `_inject_schema` method for advanced users." 
# ) schema = self._get_schema_or_create(schema_name) - return self._sql_job_client(schema, credentials).sql_client + return self._sql_job_client(schema).sql_client - def _fs_client(self, schema_name: str = None, credentials: Any = None) -> FSClientBase: + def _fs_client(self, schema_name: str = None) -> FSClientBase: """Returns a filesystem client configured to point to the right folder / bucket for each table. For example you may read all parquet files as bytes for one table with the following code: >>> files = pipeline._fs_client.list_table_files("customers") @@ -996,18 +987,18 @@ def _fs_client(self, schema_name: str = None, credentials: Any = None) -> FSClie NOTE: This currently is considered a private endpoint and will become stable after we have decided on the interface of FSClientBase. """ - client = self.destination_client(schema_name, credentials) + client = self.destination_client(schema_name) if isinstance(client, FSClientBase): return client raise FSClientNotAvailable(self.pipeline_name, self.destination.destination_name) - def destination_client(self, schema_name: str = None, credentials: Any = None) -> JobClientBase: + def destination_client(self, schema_name: str = None) -> JobClientBase: """Get the destination job client for the configured destination Use the client with `with` statement to manage opening and closing connection to the destination: >>> with pipeline.destination_client() as client: >>> client.drop_storage() # removes storage which typically wipes all data in it - The client is authenticated. You can provide alternative `schema_name` which will be used to normalize dataset name and alternative `credentials`. + The client is authenticated. You can provide alternative `schema_name` which will be used to normalize dataset name. If no schema name is provided and no default schema is present in the pipeline, and ad hoc schema will be created and discarded after use. """ schema = self._get_schema_or_create(schema_name) @@ -1021,8 +1012,8 @@ def _get_schema_or_create(self, schema_name: str = None) -> Schema: with self._maybe_destination_capabilities(): return Schema(self.pipeline_name) - def _sql_job_client(self, schema: Schema, credentials: Any = None) -> SqlJobClientBase: - client_config = self._get_destination_client_initial_config(credentials=credentials) + def _sql_job_client(self, schema: Schema) -> SqlJobClientBase: + client_config = self._get_destination_client_initial_config() client = self._get_destination_clients(schema, client_config)[0] if isinstance(client, SqlJobClientBase): return client @@ -1154,7 +1145,7 @@ def _extract_source( return load_id def _get_destination_client_initial_config( - self, destination: TDestination = None, credentials: Any = None, as_staging: bool = False + self, destination: TDestination = None, as_staging: bool = False ) -> DestinationClientConfiguration: destination = destination or self.destination if not destination: @@ -1165,19 +1156,9 @@ def _get_destination_client_initial_config( "Please provide `destination` argument to `pipeline`, `run` or `load` method" " directly or via .dlt config.toml file or environment variable.", ) - # create initial destination client config client_spec = destination.spec - # initialize explicit credentials - if not as_staging: - # explicit credentials passed to dlt.pipeline should not be applied to staging - credentials = credentials or self.credentials - if credentials is not None and not isinstance(credentials, CredentialsConfiguration): - # use passed credentials as initial value. 
initial value may resolve credentials - credentials = initialize_credentials( - client_spec.get_resolvable_fields()["credentials"], credentials - ) - # this client support many schemas and datasets + # this client supports many schemas and datasets if issubclass(client_spec, DestinationClientDwhConfiguration): if not self.dataset_name and self.dev_mode: logger.warning( @@ -1190,18 +1171,13 @@ def _get_destination_client_initial_config( ) if issubclass(client_spec, DestinationClientStagingConfiguration): - spec: DestinationClientDwhConfiguration = client_spec( - credentials=credentials, - as_staging=as_staging, - ) + spec: DestinationClientDwhConfiguration = client_spec(as_staging=as_staging) else: - spec = client_spec( - credentials=credentials, - ) + spec = client_spec() spec._bind_dataset_name(self.dataset_name, default_schema_name) return spec - return client_spec(credentials=credentials) + return client_spec() def _get_destination_clients( self, @@ -1308,6 +1284,7 @@ def _set_destinations( staging: Optional[TDestinationReferenceArg] = None, staging_name: Optional[str] = None, initializing: bool = False, + destination_credentials: Any = None, ) -> None: destination_changed = destination is not None and destination != self.destination # set destination if provided but do not swap if factory is the same @@ -1349,6 +1326,9 @@ def _set_destinations( # set new context if not initializing: self._set_context(is_active=True) + # apply explicit credentials + if self.destination and destination_credentials: + self.destination.config_params["credentials"] = destination_credentials @contextmanager def _maybe_destination_capabilities( diff --git a/dlt/pipeline/warnings.py b/dlt/pipeline/warnings.py index 8bee670cb7..ac46a4eef0 100644 --- a/dlt/pipeline/warnings.py +++ b/dlt/pipeline/warnings.py @@ -5,23 +5,6 @@ from dlt.common.destination import Destination, TDestinationReferenceArg -def credentials_argument_deprecated( - caller_name: str, credentials: t.Optional[t.Any], destination: TDestinationReferenceArg = None -) -> None: - if credentials is None: - return - - dest_name = Destination.to_name(destination) if destination else "postgres" - - warnings.warn( - f"The `credentials argument` to {caller_name} is deprecated and will be removed in a future" - " version. Pass the same credentials to the `destination` instance instead, e.g." 
- f" {caller_name}(destination=dlt.destinations.{dest_name}(credentials=...))", - Dlt04DeprecationWarning, - stacklevel=2, - ) - - def full_refresh_argument_deprecated(caller_name: str, full_refresh: t.Optional[bool]) -> None: """full_refresh argument is replaced with dev_mode""" if full_refresh is None: diff --git a/docs/examples/archive/credentials/explicit.py b/docs/examples/archive/credentials/explicit.py index b1bc25fce6..f07c69360a 100644 --- a/docs/examples/archive/credentials/explicit.py +++ b/docs/examples/archive/credentials/explicit.py @@ -1,6 +1,7 @@ import os from typing import Iterator import dlt +from dlt.destinations import postgres @dlt.resource @@ -32,14 +33,14 @@ def simple_data( # you are free to pass credentials from custom location to destination pipeline = dlt.pipeline( - destination="postgres", credentials=dlt.secrets["custom.destination.credentials"] + destination=postgres(credentials=dlt.secrets["custom.destination.credentials"]) ) # see nice credentials object print(pipeline.credentials) # you can also pass credentials partially, only the password comes from the secrets or environment pipeline = dlt.pipeline( - destination="postgres", credentials="postgres://loader@localhost:5432/dlt_data" + destination=postgres(credentials="postgres://loader@localhost:5432/dlt_data") ) # now lets compare it with default location for config and credentials diff --git a/docs/examples/archive/quickstart.py b/docs/examples/archive/quickstart.py index f435fa3fab..6806c177ce 100644 --- a/docs/examples/archive/quickstart.py +++ b/docs/examples/archive/quickstart.py @@ -46,7 +46,6 @@ pipeline_name, destination=destination_name, dataset_name=dataset_name, - credentials=credentials, export_schema_path=export_schema_path, dev_mode=True, ) @@ -69,7 +68,9 @@ }, ] -load_info = pipeline.run(rows, table_name=table_name, write_disposition="replace") +load_info = pipeline.run( + rows, table_name=table_name, write_disposition="replace", credentials=credentials +) # 4. Optional error handling - print, raise or handle. 
print() diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index 023f3e35bc..9ecd1ae6dc 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -164,8 +164,7 @@ destination.duckdb.credentials=":pipeline:" ```py p = pipeline_one = dlt.pipeline( pipeline_name="my_pipeline", - destination="duckdb", - credentials=":pipeline:", + destination=dlt.destinations.duckdb(":pipeline:"), ) ``` diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 49437d7b74..26158177ff 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -198,7 +198,8 @@ def some_data(created_at=dlt.sources.incremental("created_at")): yield from source_items2 p = dlt.pipeline( - pipeline_name=uniq_id(), destination="duckdb", credentials=duckdb.connect(":memory:") + pipeline_name=uniq_id(), + destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")), ) p.run(some_data()).raise_on_failed_jobs() p.run(some_data()).raise_on_failed_jobs() @@ -238,7 +239,8 @@ def some_data(created_at=dlt.sources.incremental("created_at")): yield from source_items2 p = dlt.pipeline( - pipeline_name=uniq_id(), destination="duckdb", credentials=duckdb.connect(":memory:") + pipeline_name=uniq_id(), + destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")), ) p.run(some_data()).raise_on_failed_jobs() p.run(some_data()).raise_on_failed_jobs() @@ -444,7 +446,8 @@ def some_data(created_at=dlt.sources.incremental("created_at")): yield from source_items p = dlt.pipeline( - pipeline_name=uniq_id(), destination="duckdb", credentials=duckdb.connect(":memory:") + pipeline_name=uniq_id(), + destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")), ) p.run(some_data()).raise_on_failed_jobs() diff --git a/tests/helpers/airflow_tests/test_airflow_wrapper.py b/tests/helpers/airflow_tests/test_airflow_wrapper.py index 533d16c998..ac12f70037 100644 --- a/tests/helpers/airflow_tests/test_airflow_wrapper.py +++ b/tests/helpers/airflow_tests/test_airflow_wrapper.py @@ -150,8 +150,7 @@ def test_regular_run() -> None: pipeline_standalone = dlt.pipeline( pipeline_name="pipeline_standalone", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=":pipeline:", + destination=dlt.destinations.duckdb(credentials=":pipeline:"), ) pipeline_standalone.run(mock_data_source()) pipeline_standalone_counts = load_table_counts( @@ -170,8 +169,7 @@ def dag_regular(): pipeline_dag_regular = dlt.pipeline( pipeline_name="pipeline_dag_regular", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=":pipeline:", + destination=dlt.destinations.duckdb(credentials=":pipeline:"), ) tasks_list = tasks.add_run( pipeline_dag_regular, @@ -214,8 +212,7 @@ def dag_decomposed(): pipeline_dag_decomposed = dlt.pipeline( pipeline_name="pipeline_dag_decomposed", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=quackdb_path, + destination=dlt.destinations.duckdb(credentials=quackdb_path), ) tasks_list = tasks.add_run( pipeline_dag_decomposed, @@ -247,8 +244,7 @@ def test_run() -> None: pipeline_standalone = dlt.pipeline( pipeline_name="pipeline_standalone", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=":pipeline:", + destination=dlt.destinations.duckdb(credentials=":pipeline:"), ) pipeline_standalone.run(mock_data_source()) 
pipeline_standalone_counts = load_table_counts( @@ -268,8 +264,7 @@ def dag_regular(): pipeline_dag_regular = dlt.pipeline( pipeline_name="pipeline_dag_regular", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=quackdb_path, + destination=dlt.destinations.duckdb(credentials=quackdb_path), ) task = tasks.run(pipeline_dag_regular, mock_data_source()) @@ -292,8 +287,7 @@ def test_parallel_run(): pipeline_standalone = dlt.pipeline( pipeline_name="pipeline_parallel", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=":pipeline:", + destination=dlt.destinations.duckdb(credentials=":pipeline:"), ) pipeline_standalone.run(mock_data_source()) pipeline_standalone_counts = load_table_counts( @@ -315,8 +309,7 @@ def dag_parallel(): pipeline_dag_parallel = dlt.pipeline( pipeline_name="pipeline_dag_parallel", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=quackdb_path, + destination=dlt.destinations.duckdb(credentials=quackdb_path), ) tasks_list = tasks.add_run( pipeline_dag_parallel, @@ -349,8 +342,7 @@ def test_parallel_incremental(): pipeline_standalone = dlt.pipeline( pipeline_name="pipeline_parallel", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=":pipeline:", + destination=dlt.destinations.duckdb(credentials=":pipeline:"), ) pipeline_standalone.run(mock_data_incremental_source()) @@ -369,8 +361,7 @@ def dag_parallel(): pipeline_dag_parallel = dlt.pipeline( pipeline_name="pipeline_dag_parallel", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=quackdb_path, + destination=dlt.destinations.duckdb(credentials=quackdb_path), ) tasks.add_run( pipeline_dag_parallel, @@ -401,8 +392,7 @@ def test_parallel_isolated_run(): pipeline_standalone = dlt.pipeline( pipeline_name="pipeline_parallel", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=":pipeline:", + destination=dlt.destinations.duckdb(credentials=":pipeline:"), ) pipeline_standalone.run(mock_data_source()) pipeline_standalone_counts = load_table_counts( @@ -424,8 +414,7 @@ def dag_parallel(): pipeline_dag_parallel = dlt.pipeline( pipeline_name="pipeline_dag_parallel", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=quackdb_path, + destination=dlt.destinations.duckdb(credentials=quackdb_path), ) tasks_list = tasks.add_run( pipeline_dag_parallel, @@ -466,8 +455,7 @@ def test_parallel_run_single_resource(): pipeline_standalone = dlt.pipeline( pipeline_name="pipeline_parallel", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=":pipeline:", + destination=dlt.destinations.duckdb(credentials=":pipeline:"), ) pipeline_standalone.run(mock_data_single_resource()) pipeline_standalone_counts = load_table_counts( @@ -489,8 +477,7 @@ def dag_parallel(): pipeline_dag_parallel = dlt.pipeline( pipeline_name="pipeline_dag_parallel", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=quackdb_path, + destination=dlt.destinations.duckdb(credentials=quackdb_path), ) tasks_list = tasks.add_run( pipeline_dag_parallel, @@ -555,8 +542,7 @@ def dag_fail_3(): pipeline_fail_3 = dlt.pipeline( pipeline_name="pipeline_fail_3", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=":pipeline:", + destination=dlt.destinations.duckdb(credentials=":pipeline:"), ) tasks.add_run( pipeline_fail_3, _fail_3, trigger_rule="all_done", retries=0, provide_context=True @@ -582,8 +568,7 @@ def dag_fail_4(): pipeline_fail_3 = 
dlt.pipeline( pipeline_name="pipeline_fail_3", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=":pipeline:", + destination=dlt.destinations.duckdb(credentials=":pipeline:"), ) tasks.add_run( pipeline_fail_3, _fail_3, trigger_rule="all_done", retries=0, provide_context=True @@ -611,8 +596,7 @@ def dag_fail_5(): pipeline_fail_3 = dlt.pipeline( pipeline_name="pipeline_fail_3", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=":pipeline:", + destination=dlt.destinations.duckdb(credentials=":pipeline:"), ) tasks.add_run( pipeline_fail_3, _fail_3, trigger_rule="all_done", retries=0, provide_context=True @@ -886,8 +870,9 @@ def dag_parallel(): pipe = dlt.pipeline( pipeline_name="test_pipeline", dataset_name="mock_data", - destination="duckdb", - credentials=os.path.join("_storage", "test_pipeline.duckdb"), + destination=dlt.destinations.duckdb( + credentials=os.path.join("_storage", "test_pipeline.duckdb") + ), ) task = tasks.add_run( pipe, @@ -954,8 +939,7 @@ def dag_regular(): call_dag = dlt.pipeline( pipeline_name="callable_dag", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=quackdb_path, + destination=dlt.destinations.duckdb(credentials=quackdb_path), ) tasks.run(call_dag, callable_source) @@ -991,8 +975,7 @@ def dag_regular(): call_dag = dlt.pipeline( pipeline_name="callable_dag", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=quackdb_path, + destination=dlt.destinations.duckdb(credentials=quackdb_path), ) tasks.run(call_dag, mock_data_source, on_before_run=on_before_run) diff --git a/tests/helpers/airflow_tests/test_join_airflow_scheduler.py b/tests/helpers/airflow_tests/test_join_airflow_scheduler.py index 8c1992c506..d737f254e3 100644 --- a/tests/helpers/airflow_tests/test_join_airflow_scheduler.py +++ b/tests/helpers/airflow_tests/test_join_airflow_scheduler.py @@ -292,8 +292,7 @@ def test_scheduler_pipeline_state() -> None: pipeline = dlt.pipeline( pipeline_name="pipeline_dag_regular", dataset_name="mock_data_" + uniq_id(), - destination="duckdb", - credentials=":pipeline:", + destination=dlt.destinations.duckdb(credentials=":pipeline:"), ) now = pendulum.now() diff --git a/tests/load/duckdb/test_duckdb_client.py b/tests/load/duckdb/test_duckdb_client.py index ebbe959874..f4088a7608 100644 --- a/tests/load/duckdb/test_duckdb_client.py +++ b/tests/load/duckdb/test_duckdb_client.py @@ -67,7 +67,9 @@ def test_duckdb_in_memory_mode_via_factory(): # Check if passing :memory: to factory fails with pytest.raises(PipelineStepFailed) as exc: - p = dlt.pipeline(pipeline_name="booboo", destination="duckdb", credentials=":memory:") + p = dlt.pipeline( + pipeline_name="booboo", destination=dlt.destinations.duckdb(credentials=":memory:") + ) p.run([1, 2, 3]) assert isinstance(exc.value.exception, InvalidInMemoryDuckdbCredentials) @@ -85,7 +87,7 @@ def test_duckdb_in_memory_mode_via_factory(): with pytest.raises(PipelineStepFailed) as exc: p = dlt.pipeline( pipeline_name="booboo", - destination=Destination.from_reference("duckdb", credentials=":memory:"), # type: ignore[arg-type] + destination=Destination.from_reference("duckdb", credentials=":memory:"), ) p.run([1, 2, 3], table_name="numbers") @@ -203,7 +205,9 @@ def test_duckdb_database_path() -> None: def test_keeps_initial_db_path() -> None: db_path = "_storage/path_test_quack.duckdb" - p = dlt.pipeline(pipeline_name="quack_pipeline", credentials=db_path, destination="duckdb") + p = dlt.pipeline( + pipeline_name="quack_pipeline", 
destination=dlt.destinations.duckdb(credentials=db_path) + ) print(p.pipelines_dir) with p.sql_client() as conn: # still cwd @@ -251,7 +255,7 @@ def test_duck_database_path_delete() -> None: db_folder = "_storage/db_path" os.makedirs(db_folder) db_path = f"{db_folder}/path_test_quack.duckdb" - p = dlt.pipeline(pipeline_name="deep_quack_pipeline", credentials=db_path, destination="duckdb") + p = dlt.pipeline(pipeline_name="deep_quack_pipeline", destination=duckdb(credentials=db_path)) p.run([1, 2, 3], table_name="table", dataset_name="dataset") # attach the pipeline p = dlt.attach(pipeline_name="deep_quack_pipeline") @@ -272,7 +276,7 @@ def test_case_sensitive_database_name() -> None: cs_quack = os.path.join(TEST_STORAGE_ROOT, "QuAcK") os.makedirs(cs_quack, exist_ok=True) db_path = os.path.join(cs_quack, "path_TEST_quack.duckdb") - p = dlt.pipeline(pipeline_name="NOT_QUAck", credentials=db_path, destination="duckdb") + p = dlt.pipeline(pipeline_name="NOT_QUAck", destination=duckdb(credentials=db_path)) with p.sql_client() as conn: conn.execute_sql("DESCRIBE;") diff --git a/tests/load/filesystem/test_aws_credentials.py b/tests/load/filesystem/test_aws_credentials.py index 5e0a3c3fd0..b782e76b7e 100644 --- a/tests/load/filesystem/test_aws_credentials.py +++ b/tests/load/filesystem/test_aws_credentials.py @@ -142,6 +142,24 @@ def test_aws_credentials_with_endpoint_url(environment: Dict[str, str]) -> None: } +def test_explicit_filesystem_credentials() -> None: + import dlt + from dlt.destinations import filesystem + + # try filesystem which uses union of credentials that requires bucket_url to resolve + p = dlt.pipeline( + pipeline_name="postgres_pipeline", + destination=filesystem( + bucket_url="s3://test", + destination_name="uniq_s3_bucket", + credentials={"aws_access_key_id": "key_id", "aws_secret_access_key": "key"}, + ), + ) + config = p.destination_client().config + assert isinstance(config.credentials, AwsCredentials) + assert config.credentials.is_resolved() + + def set_aws_credentials_env(environment: Dict[str, str]) -> None: environment["AWS_ACCESS_KEY_ID"] = "fake_access_key" environment["AWS_SECRET_ACCESS_KEY"] = "fake_secret_key" diff --git a/tests/load/filesystem/test_gcs_credentials.py b/tests/load/filesystem/test_gcs_credentials.py new file mode 100644 index 0000000000..febfa27ea4 --- /dev/null +++ b/tests/load/filesystem/test_gcs_credentials.py @@ -0,0 +1,32 @@ +import pytest + +import dlt +from dlt.destinations import filesystem +from dlt.sources.credentials import GcpOAuthCredentials +from tests.load.utils import ALL_FILESYSTEM_DRIVERS + +# mark all tests as essential, do not remove +pytestmark = pytest.mark.essential + +if "gs" not in ALL_FILESYSTEM_DRIVERS: + pytest.skip("gcs filesystem driver not configured", allow_module_level=True) + + +def test_explicit_filesystem_credentials() -> None: + # resolve gcp oauth + p = dlt.pipeline( + pipeline_name="postgres_pipeline", + destination=filesystem( + "gcs://test", + destination_name="uniq_gcs_bucket", + credentials={ + "project_id": "pxid", + "refresh_token": "123token", + "client_id": "cid", + "client_secret": "s", + }, + ), + ) + config = p.destination_client().config + assert config.credentials.is_resolved() + assert isinstance(config.credentials, GcpOAuthCredentials) diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index a12c29168f..5b3c158b6c 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -13,12 +13,12 @@ from 
dlt.common.destination.reference import WithStagingDataset from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.schema.schema import Schema -from dlt.common.schema.typing import PIPELINE_STATE_TABLE_NAME, VERSION_TABLE_NAME -from dlt.common.schema.utils import pipeline_state_table +from dlt.common.schema.typing import VERSION_TABLE_NAME from dlt.common.typing import TDataItem from dlt.common.utils import uniq_id from dlt.destinations.exceptions import DatabaseUndefinedRelation +from dlt.destinations import filesystem, redshift from dlt.extract.exceptions import ResourceNameMissing from dlt.extract import DltSource from dlt.pipeline.exceptions import ( @@ -521,10 +521,16 @@ def test_dataset_name_change(destination_config: DestinationTestConfiguration) - def test_pipeline_explicit_destination_credentials( destination_config: DestinationTestConfiguration, ) -> None: + from dlt.destinations import postgres + from dlt.destinations.impl.postgres.configuration import PostgresCredentials + # explicit credentials resolved p = dlt.pipeline( - destination=Destination.from_reference("postgres", destination_name="mydest"), - credentials="postgresql://loader:loader@localhost:7777/dlt_data", + destination=Destination.from_reference( + "postgres", + destination_name="mydest", + credentials="postgresql://loader:loader@localhost:7777/dlt_data", + ), ) c = p._get_destination_clients(Schema("s"), p._get_destination_client_initial_config())[0] assert c.config.credentials.port == 7777 # type: ignore[attr-defined] @@ -533,8 +539,11 @@ def test_pipeline_explicit_destination_credentials( # explicit credentials resolved ignoring the config providers os.environ["DESTINATION__MYDEST__CREDENTIALS__HOST"] = "HOST" p = dlt.pipeline( - destination=Destination.from_reference("postgres", destination_name="mydest"), - credentials="postgresql://loader:loader@localhost:5432/dlt_data", + destination=Destination.from_reference( + "postgres", + destination_name="mydest", + credentials="postgresql://loader:loader@localhost:5432/dlt_data", + ), ) c = p._get_destination_clients(Schema("s"), p._get_destination_client_initial_config())[0] assert c.config.credentials.host == "localhost" # type: ignore[attr-defined] @@ -543,20 +552,35 @@ def test_pipeline_explicit_destination_credentials( os.environ["DESTINATION__MYDEST__CREDENTIALS__USERNAME"] = "UN" os.environ["DESTINATION__MYDEST__CREDENTIALS__PASSWORD"] = "PW" p = dlt.pipeline( - destination=Destination.from_reference("postgres", destination_name="mydest"), - credentials="postgresql://localhost:5432/dlt_data", + destination=Destination.from_reference( + "postgres", + destination_name="mydest", + credentials="postgresql://localhost:5432/dlt_data", + ), ) c = p._get_destination_clients(Schema("s"), p._get_destination_client_initial_config())[0] assert c.config.credentials.username == "UN" # type: ignore[attr-defined] - # host is also overridden - assert c.config.credentials.host == "HOST" # type: ignore[attr-defined] + # host is taken form explicit credentials + assert c.config.credentials.host == "localhost" # type: ignore[attr-defined] # instance of credentials will be simply passed - # c = RedshiftCredentials("postgresql://loader:loader@localhost/dlt_data") - # assert c.is_resolved() - # p = dlt.pipeline(destination="postgres", credentials=c) - # inner_c = p._get_destination_clients(Schema("s"), p._get_destination_client_initial_config())[0] - # assert inner_c is c + cred = PostgresCredentials("postgresql://user:pass@localhost/dlt_data") + p = 
dlt.pipeline(destination=postgres(credentials=cred)) + inner_c = p.destination_client() + assert inner_c.config.credentials is cred + + # with staging + p = dlt.pipeline( + pipeline_name="postgres_pipeline", + staging=filesystem("_storage"), + destination=redshift(credentials="redshift://loader:password@localhost:5432/dlt_data"), + ) + config = p.destination_client().config + assert config.credentials.is_resolved() + assert ( + config.credentials.to_native_representation() + == "redshift://loader:password@localhost:5432/dlt_data?connect_timeout=15" + ) # do not remove - it allows us to filter tests by destination @@ -703,9 +727,8 @@ def gen2(): # restore from destination, check state p = dlt.pipeline( pipeline_name="source_1_pipeline", - destination="duckdb", + destination=dlt.destinations.duckdb(credentials="duckdb:///_storage/test_quack.duckdb"), dataset_name="shared_dataset", - credentials="duckdb:///_storage/test_quack.duckdb", ) p.sync_destination() # we have our separate state @@ -720,9 +743,8 @@ def gen2(): p = dlt.pipeline( pipeline_name="source_2_pipeline", - destination="duckdb", + destination=dlt.destinations.duckdb(credentials="duckdb:///_storage/test_quack.duckdb"), dataset_name="shared_dataset", - credentials="duckdb:///_storage/test_quack.duckdb", ) p.sync_destination() # we have our separate state diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 95b97c7666..6a6bf4bde1 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -15,7 +15,7 @@ import dlt from dlt.common import json, pendulum from dlt.common.configuration.container import Container -from dlt.common.configuration.exceptions import ConfigFieldMissingException +from dlt.common.configuration.exceptions import ConfigFieldMissingException, InvalidNativeValue from dlt.common.configuration.specs.aws_credentials import AwsCredentials from dlt.common.configuration.specs.exceptions import NativeValueError from dlt.common.configuration.specs.gcp_credentials import GcpOAuthCredentials @@ -267,49 +267,16 @@ def test_deterministic_salt(environment) -> None: def test_destination_explicit_credentials(environment: Any) -> None: + from dlt.destinations import motherduck + # test redshift p = dlt.pipeline( pipeline_name="postgres_pipeline", - destination="redshift", - credentials="redshift://loader:loader@localhost:5432/dlt_data", - ) - config = p._get_destination_client_initial_config() - assert config.credentials.is_resolved() - # with staging - p = dlt.pipeline( - pipeline_name="postgres_pipeline", - staging="filesystem", - destination="redshift", - credentials="redshift://loader:loader@localhost:5432/dlt_data", - ) - config = p._get_destination_client_initial_config(p.destination) - assert config.credentials.is_resolved() - config = p._get_destination_client_initial_config(p.staging, as_staging=True) - assert config.credentials is None - p._wipe_working_folder() - # try filesystem which uses union of credentials that requires bucket_url to resolve - p = dlt.pipeline( - pipeline_name="postgres_pipeline", - destination="filesystem", - credentials={"aws_access_key_id": "key_id", "aws_secret_access_key": "key"}, - ) - config = p._get_destination_client_initial_config(p.destination) - assert isinstance(config.credentials, AwsCredentials) - assert config.credentials.is_resolved() - # resolve gcp oauth - p = dlt.pipeline( - pipeline_name="postgres_pipeline", - destination="filesystem", - credentials={ - "project_id": "pxid", - "refresh_token": "123token", - "client_id": 
"cid", - "client_secret": "s", - }, + destination=motherduck(credentials="md://user:password@/dlt_data"), ) - config = p._get_destination_client_initial_config(p.destination) - assert isinstance(config.credentials, GcpOAuthCredentials) + config = p.destination_client().config assert config.credentials.is_resolved() + assert config.credentials.to_native_representation() == "md://user:password@/dlt_data" def test_destination_staging_config(environment: Any) -> None: @@ -362,14 +329,15 @@ def test_destination_credentials_in_factory(environment: Any) -> None: assert dest_config.credentials.database == "some_db" -@pytest.mark.skip(reason="does not work on CI. probably takes right credentials from somewhere....") def test_destination_explicit_invalid_credentials_filesystem(environment: Any) -> None: # if string cannot be parsed p = dlt.pipeline( - pipeline_name="postgres_pipeline", destination="filesystem", credentials="PR8BLEM" + pipeline_name="postgres_pipeline", + destination=filesystem(bucket_url="s3://test", destination_name="uniq_s3_bucket"), ) - with pytest.raises(NativeValueError): - p._get_destination_client_initial_config(p.destination) + with pytest.raises(PipelineStepFailed) as pip_ex: + p.run([1, 2, 3], table_name="data", credentials="PR8BLEM") + assert isinstance(pip_ex.value.__cause__, InvalidNativeValue) def test_extract_source_twice() -> None: diff --git a/tests/pipeline/test_schema_contracts.py b/tests/pipeline/test_schema_contracts.py index a46529b861..4b46bb7c3e 100644 --- a/tests/pipeline/test_schema_contracts.py +++ b/tests/pipeline/test_schema_contracts.py @@ -177,8 +177,7 @@ def get_pipeline(): return dlt.pipeline( pipeline_name="contracts_" + uniq_id(), - destination="duckdb", - credentials=duckdb.connect(":memory:"), + destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")), dev_mode=True, ) From 1564dabb5f4eac1e0512be639f3358550fd96b53 Mon Sep 17 00:00:00 2001 From: dat-a-man <98139823+dat-a-man@users.noreply.github.com> Date: Thu, 4 Jul 2024 20:05:09 +0530 Subject: [PATCH 50/61] Added Postgres replication verified sources docs (#1510) * Added pg_replication docs * Updated * Update docs/website/docs/dlt-ecosystem/verified-sources/pg_replication.md * Update docs/website/docs/dlt-ecosystem/verified-sources/pg_replication.md --------- Co-authored-by: Alena Astrakhantseva --- .../verified-sources/pg_replication.md | 271 ++++++++++++++++++ docs/website/sidebars.js | 1 + 2 files changed, 272 insertions(+) create mode 100644 docs/website/docs/dlt-ecosystem/verified-sources/pg_replication.md diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/pg_replication.md b/docs/website/docs/dlt-ecosystem/verified-sources/pg_replication.md new file mode 100644 index 0000000000..6d69f09cd3 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/verified-sources/pg_replication.md @@ -0,0 +1,271 @@ +--- +title: Postgres replication +description: dlt verified source for Postgres replication +keywords: [postgres, postgres replication, database replication] +--- +import Header from './_source-info-header.md'; + +# Postgres replication + +
+
+[Postgres](https://www.postgresql.org/) is one of the most popular relational database management systems. This verified source uses Postgres replication functionality to efficiently process tables (a process often referred to as *Change Data Capture* or CDC). It uses [logical decoding](https://www.postgresql.org/docs/current/logicaldecoding.html) and the standard built-in `pgoutput` [output plugin](https://www.postgresql.org/docs/current/logicaldecoding-output-plugin.html).
+
+Resources that can be loaded using this verified source are:
+
+| Name                 | Description                                      |
+| -------------------- | ------------------------------------------------ |
+| replication_resource | Load published messages from a replication slot  |
+
+## Setup Guide
+
+### Setup user
+To set up a Postgres user, follow these steps:
+
+1. The Postgres user needs to have the `LOGIN` and `REPLICATION` attributes assigned:
+
+   ```sql
+   CREATE ROLE replication_user WITH LOGIN REPLICATION;
+   ```
+
+2. It also needs the `CREATE` privilege on the database:
+
+   ```sql
+   GRANT CREATE ON DATABASE dlt_data TO replication_user;
+   ```
+
+### Set up RDS
+To set up a Postgres user on RDS, follow these steps:
+
+1. You must enable replication for your RDS Postgres instance via a [Parameter Group](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_PostgreSQL.Replication.ReadReplicas.html).
+
+2. `WITH LOGIN REPLICATION;` does not work on RDS; instead, run:
+
+   ```sql
+   GRANT rds_replication TO replication_user;
+   ```
+
+3. Prevent a fallback to a non-SSL connection by setting the connection parameters:
+
+   ```toml
+   sources.pg_replication.credentials="postgresql://loader:password@host.rds.amazonaws.com:5432/dlt_data?sslmode=require&connect_timeout=300"
+   ```
+
+### Initialize the verified source
+
+To get started with your data pipeline, follow these steps:
+
+1. Enter the following command:
+
+   ```sh
+   dlt init pg_replication duckdb
+   ```
+
+   It will initialize [the pipeline example](https://github.com/dlt-hub/verified-sources/blob/master/sources/pg_replication_pipeline.py) with Postgres replication as the [source](https://dlthub.com/docs/general-usage/source) and [DuckDB](https://dlthub.com/docs/dlt-ecosystem/destinations/duckdb) as the [destination](https://dlthub.com/docs/dlt-ecosystem/destinations).
+
+2. If you'd like to use a different destination, simply replace `duckdb` with the name of your preferred [destination](https://dlthub.com/docs/dlt-ecosystem/destinations).
+
+3. This source uses the `sql_database` source; you can initialize it as follows:
+
+   ```sh
+   dlt init sql_database duckdb
+   ```
+   :::note
+   The `sql_database` source is only required if you perform an initial load, specifically when `persist_snapshots` is set to `True`.
+   :::
+
+4. After running these two commands, a new directory will be created with the necessary files and configuration settings to get started.
+
+   For more information, read the guide on [how to add a verified source](https://dlthub.com/docs/walkthroughs/add-a-verified-source).
+
+   :::note
+   You can omit the `[sql.sources.credentials]` section in `secrets.toml` as it is not required.
+   :::
+
+### Add credentials
+
+1. In the `.dlt` folder, there's a file called `secrets.toml`. It's where you store sensitive information securely, like access tokens. Keep this file safe.
+
+   Here's what the `secrets.toml` looks like:
+
+   ```toml
+   [sources.pg_replication.credentials]
+   drivername = "postgresql" # please set me up!
+   database = "database" # please set me up!
+   password = "password" # please set me up!
+   username = "username" # please set me up!
+   host = "host" # please set me up!
+   port = 0 # please set me up!
+   ```
+
+2. Credentials can be set as shown above. Alternatively, you can provide credentials in the `secrets.toml` file as follows:
+
+   ```toml
+   sources.pg_replication.credentials="postgresql://username:password@host:port/database"
+   ```
+
+3. Finally, follow the instructions in [Destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/) to add credentials for your chosen destination. This will ensure that your data is properly routed.
+
+For more information, read the [Configuration section](https://dlthub.com/docs/general-usage/credentials).
+
+## Run the pipeline
+
+1. Before running the pipeline, ensure that you have installed all the necessary dependencies by running the command:
+   ```sh
+   pip install -r requirements.txt
+   ```
+2. You're now ready to run the pipeline! To get started, run the following command:
+   ```sh
+   python pg_replication_pipeline.py
+   ```
+3. Once the pipeline has finished running, you can verify that everything loaded correctly by using the following command:
+   ```sh
+   dlt pipeline <pipeline_name> show
+   ```
+   For example, the `pipeline_name` for the above pipeline example is `pg_replication_pipeline`; you may also use any custom name instead.
+
+   For more information, read the guide on [how to run a pipeline](https://dlthub.com/docs/walkthroughs/run-a-pipeline).
+
+## Sources and resources
+
+`dlt` works on the principle of [sources](https://dlthub.com/docs/general-usage/source) and [resources](https://dlthub.com/docs/general-usage/resource).
+
+### Resource `replication_resource`
+
+This resource yields data items for changes in one or more Postgres tables.
+
+```py
+@dlt.resource(
+    name=lambda args: args["slot_name"] + "_" + args["pub_name"],
+    standalone=True,
+)
+def replication_resource(
+    slot_name: str,
+    pub_name: str,
+    credentials: ConnectionStringCredentials = dlt.secrets.value,
+    include_columns: Optional[Dict[str, Sequence[str]]] = None,
+    columns: Optional[Dict[str, TTableSchemaColumns]] = None,
+    target_batch_size: int = 1000,
+    flush_slot: bool = True,
+) -> Iterable[Union[TDataItem, DataItemWithMeta]]:
+    ...
+```
+
+`slot_name`: Replication slot name to consume messages from.
+
+`pub_name`: Publication name to publish messages under.
+
+`include_columns`: Maps table name(s) to a sequence of column names to include in the generated data items. Any column not in the sequence is excluded. If not provided, all columns are included.
+
+`columns`: Maps table name(s) to column hints to apply on the replicated table(s).
+
+`target_batch_size`: Desired number of data items yielded in a batch. Can be used to limit the data items in memory.
+
+`flush_slot`: Whether processed messages are discarded from the replication slot. The recommended value is `True`.
+
+## Customization
+
+If you wish to create your own pipelines, you can leverage source and resource methods from this verified source.
+
+1. Define the source pipeline as:
+
+   ```py
+   # Defining source pipeline
+   src_pl = dlt.pipeline(
+       pipeline_name="source_pipeline",
+       destination="postgres",
+       dataset_name="source_dataset",
+       dev_mode=True,
+   )
+   ```
+
+   You can configure and use the `get_postgres_pipeline()` function available in the `pg_replication_pipeline.py` file to achieve the same functionality.
+
+   :::note IMPORTANT
+   When working with large datasets from a Postgres database, it's important to consider the relevance of the source pipeline.
+   For testing purposes, using the source pipeline can be beneficial to try out the data flow. However, in production use cases, there will likely be another process that mutates the Postgres database. In such cases, the user generally only needs to define a destination pipeline.
+   :::
+
+2. Similarly, define the destination pipeline.
+
+   ```py
+   dest_pl = dlt.pipeline(
+       pipeline_name="pg_replication_pipeline",
+       destination='duckdb',
+       dataset_name="replicate_single_table",
+       dev_mode=True,
+   )
+   ```
+
+3. Define the slot and publication names as:
+
+   ```py
+   slot_name = "example_slot"
+   pub_name = "example_pub"
+   ```
+
+4. To initialize replication, use the `init_replication` function. It lets `dlt` configure Postgres and make it ready for replication.
+
+   ```py
+   # requires the Postgres user to have the REPLICATION attribute assigned
+   init_replication(
+       slot_name=slot_name,
+       pub_name=pub_name,
+       schema_name=src_pl.dataset_name,
+       table_names="my_source_table",
+       reset=True,
+   )
+   ```
+
+   :::note
+   To replicate the entire schema, you can omit the `table_names` argument from the `init_replication` function.
+   :::
+
+5. To snapshot the data to the destination during the initial load, use the `persist_snapshots=True` argument as follows:
+
+   ```py
+   snapshot = init_replication(  # requires the Postgres user to have the REPLICATION attribute assigned
+       slot_name=slot_name,
+       pub_name=pub_name,
+       schema_name=src_pl.dataset_name,
+       table_names="my_source_table",
+       persist_snapshots=True,  # persist snapshot table(s) and let function return resource(s) for initial load
+       reset=True,
+   )
+   ```
+
+6. To load this snapshot to the destination, run the destination pipeline as:
+
+   ```py
+   dest_pl.run(snapshot)
+   ```
+
+7. After changes are made to the source, you can replicate the changes to the destination using the `replication_resource` and run the pipeline as:
+
+   ```py
+   # Create a resource that generates items for each change in the source table
+   changes = replication_resource(slot_name, pub_name)
+
+   # Run the pipeline as
+   dest_pl.run(changes)
+   ```
+
+8. To replicate tables with selected columns, you can use the `include_columns` argument as follows:
+
+   ```py
+   # requires the Postgres user to have the REPLICATION attribute assigned
+   initial_load = init_replication(
+       slot_name=slot_name,
+       pub_name=pub_name,
+       schema_name=src_pl.dataset_name,
+       table_names="my_source_table",
+       include_columns={
+           "my_source_table": ("column1", "column2")
+       },
+       reset=True,
+   )
+   ```
+
+   Similarly, to replicate changes from selected columns, you can use the `table_names` and `include_columns` arguments in the `replication_resource` function.
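To see how these pieces fit together, below is a condensed, editor-added sketch that stitches the steps above into a single script. The import path is an assumption (adjust it to wherever `dlt init` placed the `pg_replication` source code); the slot, publication, schema, and table names are the illustrative ones used above.

```py
import dlt

# Assumed import path; adjust to where `dlt init` placed the pg_replication source.
from pg_replication import init_replication, replication_resource

slot_name = "example_slot"
pub_name = "example_pub"

# Destination pipeline that receives the replicated data (step 2 above).
dest_pl = dlt.pipeline(
    pipeline_name="pg_replication_pipeline",
    destination="duckdb",
    dataset_name="replicate_single_table",
    dev_mode=True,
)

# One-time setup: create the replication slot and publication and persist a
# snapshot of the current table contents (steps 4 and 5 above).
# Requires the Postgres user to have the REPLICATION attribute assigned.
snapshot = init_replication(
    slot_name=slot_name,
    pub_name=pub_name,
    schema_name="source_dataset",  # schema that contains the replicated table
    table_names="my_source_table",
    persist_snapshots=True,
    reset=True,
)

# Initial load of the snapshot (step 6), then replication of published changes (step 7).
dest_pl.run(snapshot)
changes = replication_resource(slot_name, pub_name)
dest_pl.run(changes)
```

Because `flush_slot` defaults to `True`, re-running the last two lines should pick up only the messages published since the previous run.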
\ No newline at end of file diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 4fa1c58eae..e1773cb5b3 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -82,6 +82,7 @@ const sidebars = { 'dlt-ecosystem/verified-sources/mux', 'dlt-ecosystem/verified-sources/notion', 'dlt-ecosystem/verified-sources/personio', + 'dlt-ecosystem/verified-sources/pg_replication', 'dlt-ecosystem/verified-sources/pipedrive', 'dlt-ecosystem/verified-sources/rest_api', 'dlt-ecosystem/verified-sources/openapi-generator', From c3bcbb79e0e3fc029ae142fae6df0adc2f21c7cb Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Thu, 4 Jul 2024 17:42:21 +0200 Subject: [PATCH 51/61] Fix paginate import path in the tutorial (#1540) From e55bb0eef75f175dacab665351ff55187ede5fb8 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Thu, 4 Jul 2024 11:53:27 -0400 Subject: [PATCH 52/61] Extend orjson dependency allowed range with excluded versions (#1501) * Extend orjson dependency allowed range with excluded versions * bumps orjson in lock --------- Co-authored-by: Marcin Rudolf --- poetry.lock | 112 ++++++++++++++++++++++--------------------------- pyproject.toml | 3 +- 2 files changed, 51 insertions(+), 64 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2cef57424d..323b2188d3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5948,71 +5948,57 @@ dev = ["black", "mypy", "pytest"] [[package]] name = "orjson" -version = "3.9.5" +version = "3.10.5" description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "orjson-3.9.5-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:ad6845912a71adcc65df7c8a7f2155eba2096cf03ad2c061c93857de70d699ad"}, - {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e298e0aacfcc14ef4476c3f409e85475031de24e5b23605a465e9bf4b2156273"}, - {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:83c9939073281ef7dd7c5ca7f54cceccb840b440cec4b8a326bda507ff88a0a6"}, - {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e174cc579904a48ee1ea3acb7045e8a6c5d52c17688dfcb00e0e842ec378cabf"}, - {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f8d51702f42c785b115401e1d64a27a2ea767ae7cf1fb8edaa09c7cf1571c660"}, - {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f13d61c0c7414ddee1ef4d0f303e2222f8cced5a2e26d9774751aecd72324c9e"}, - {file = "orjson-3.9.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d748cc48caf5a91c883d306ab648df1b29e16b488c9316852844dd0fd000d1c2"}, - {file = "orjson-3.9.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bd19bc08fa023e4c2cbf8294ad3f2b8922f4de9ba088dbc71e6b268fdf54591c"}, - {file = "orjson-3.9.5-cp310-none-win32.whl", hash = "sha256:5793a21a21bf34e1767e3d61a778a25feea8476dcc0bdf0ae1bc506dc34561ea"}, - {file = "orjson-3.9.5-cp310-none-win_amd64.whl", hash = "sha256:2bcec0b1024d0031ab3eab7a8cb260c8a4e4a5e35993878a2da639d69cdf6a65"}, - {file = "orjson-3.9.5-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:8547b95ca0e2abd17e1471973e6d676f1d8acedd5f8fb4f739e0612651602d66"}, - {file = "orjson-3.9.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:87ce174d6a38d12b3327f76145acbd26f7bc808b2b458f61e94d83cd0ebb4d76"}, - {file = "orjson-3.9.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a960bb1bc9a964d16fcc2d4af5a04ce5e4dfddca84e3060c35720d0a062064fe"}, - {file = "orjson-3.9.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a7aa5573a949760d6161d826d34dc36db6011926f836851fe9ccb55b5a7d8e8"}, - {file = "orjson-3.9.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8b2852afca17d7eea85f8e200d324e38c851c96598ac7b227e4f6c4e59fbd3df"}, - {file = "orjson-3.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa185959c082475288da90f996a82e05e0c437216b96f2a8111caeb1d54ef926"}, - {file = "orjson-3.9.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:89c9332695b838438ea4b9a482bce8ffbfddde4df92750522d928fb00b7b8dce"}, - {file = "orjson-3.9.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2493f1351a8f0611bc26e2d3d407efb873032b4f6b8926fed8cfed39210ca4ba"}, - {file = "orjson-3.9.5-cp311-none-win32.whl", hash = "sha256:ffc544e0e24e9ae69301b9a79df87a971fa5d1c20a6b18dca885699709d01be0"}, - {file = "orjson-3.9.5-cp311-none-win_amd64.whl", hash = "sha256:89670fe2732e3c0c54406f77cad1765c4c582f67b915c74fda742286809a0cdc"}, - {file = "orjson-3.9.5-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:15df211469625fa27eced4aa08dc03e35f99c57d45a33855cc35f218ea4071b8"}, - {file = "orjson-3.9.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9f17c59fe6c02bc5f89ad29edb0253d3059fe8ba64806d789af89a45c35269a"}, - {file = "orjson-3.9.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ca6b96659c7690773d8cebb6115c631f4a259a611788463e9c41e74fa53bf33f"}, - {file = "orjson-3.9.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a26fafe966e9195b149950334bdbe9026eca17fe8ffe2d8fa87fdc30ca925d30"}, - {file = "orjson-3.9.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9006b1eb645ecf460da067e2dd17768ccbb8f39b01815a571bfcfab7e8da5e52"}, - {file = "orjson-3.9.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebfdbf695734b1785e792a1315e41835ddf2a3e907ca0e1c87a53f23006ce01d"}, - {file = "orjson-3.9.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4a3943234342ab37d9ed78fb0a8f81cd4b9532f67bf2ac0d3aa45fa3f0a339f3"}, - {file = "orjson-3.9.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e6762755470b5c82f07b96b934af32e4d77395a11768b964aaa5eb092817bc31"}, - {file = "orjson-3.9.5-cp312-none-win_amd64.whl", hash = "sha256:c74df28749c076fd6e2157190df23d43d42b2c83e09d79b51694ee7315374ad5"}, - {file = "orjson-3.9.5-cp37-cp37m-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:88e18a74d916b74f00d0978d84e365c6bf0e7ab846792efa15756b5fb2f7d49d"}, - {file = "orjson-3.9.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d28514b5b6dfaf69097be70d0cf4f1407ec29d0f93e0b4131bf9cc8fd3f3e374"}, - {file = "orjson-3.9.5-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25b81aca8c7be61e2566246b6a0ca49f8aece70dd3f38c7f5c837f398c4cb142"}, - {file = "orjson-3.9.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:385c1c713b1e47fd92e96cf55fd88650ac6dfa0b997e8aa7ecffd8b5865078b1"}, - {file = "orjson-3.9.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:f9850c03a8e42fba1a508466e6a0f99472fd2b4a5f30235ea49b2a1b32c04c11"}, - {file = "orjson-3.9.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4449f84bbb13bcef493d8aa669feadfced0f7c5eea2d0d88b5cc21f812183af8"}, - {file = "orjson-3.9.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:86127bf194f3b873135e44ce5dc9212cb152b7e06798d5667a898a00f0519be4"}, - {file = "orjson-3.9.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0abcd039f05ae9ab5b0ff11624d0b9e54376253b7d3217a358d09c3edf1d36f7"}, - {file = "orjson-3.9.5-cp37-none-win32.whl", hash = "sha256:10cc8ad5ff7188efcb4bec196009d61ce525a4e09488e6d5db41218c7fe4f001"}, - {file = "orjson-3.9.5-cp37-none-win_amd64.whl", hash = "sha256:ff27e98532cb87379d1a585837d59b187907228268e7b0a87abe122b2be6968e"}, - {file = "orjson-3.9.5-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:5bfa79916ef5fef75ad1f377e54a167f0de334c1fa4ebb8d0224075f3ec3d8c0"}, - {file = "orjson-3.9.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e87dfa6ac0dae764371ab19b35eaaa46dfcb6ef2545dfca03064f21f5d08239f"}, - {file = "orjson-3.9.5-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:50ced24a7b23058b469ecdb96e36607fc611cbaee38b58e62a55c80d1b3ad4e1"}, - {file = "orjson-3.9.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b1b74ea2a3064e1375da87788897935832e806cc784de3e789fd3c4ab8eb3fa5"}, - {file = "orjson-3.9.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7cb961efe013606913d05609f014ad43edfaced82a576e8b520a5574ce3b2b9"}, - {file = "orjson-3.9.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1225d2d5ee76a786bda02f8c5e15017462f8432bb960de13d7c2619dba6f0275"}, - {file = "orjson-3.9.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f39f4b99199df05c7ecdd006086259ed25886cdbd7b14c8cdb10c7675cfcca7d"}, - {file = "orjson-3.9.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a461dc9fb60cac44f2d3218c36a0c1c01132314839a0e229d7fb1bba69b810d8"}, - {file = "orjson-3.9.5-cp38-none-win32.whl", hash = "sha256:dedf1a6173748202df223aea29de814b5836732a176b33501375c66f6ab7d822"}, - {file = "orjson-3.9.5-cp38-none-win_amd64.whl", hash = "sha256:fa504082f53efcbacb9087cc8676c163237beb6e999d43e72acb4bb6f0db11e6"}, - {file = "orjson-3.9.5-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:6900f0248edc1bec2a2a3095a78a7e3ef4e63f60f8ddc583687eed162eedfd69"}, - {file = "orjson-3.9.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17404333c40047888ac40bd8c4d49752a787e0a946e728a4e5723f111b6e55a5"}, - {file = "orjson-3.9.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0eefb7cfdd9c2bc65f19f974a5d1dfecbac711dae91ed635820c6b12da7a3c11"}, - {file = "orjson-3.9.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:68c78b2a3718892dc018adbc62e8bab6ef3c0d811816d21e6973dee0ca30c152"}, - {file = "orjson-3.9.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:591ad7d9e4a9f9b104486ad5d88658c79ba29b66c5557ef9edf8ca877a3f8d11"}, - {file = "orjson-3.9.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6cc2cbf302fbb2d0b2c3c142a663d028873232a434d89ce1b2604ebe5cc93ce8"}, - {file = "orjson-3.9.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b26b5aa5e9ee1bad2795b925b3adb1b1b34122cb977f30d89e0a1b3f24d18450"}, - {file = 
"orjson-3.9.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ef84724f7d29dcfe3aafb1fc5fc7788dca63e8ae626bb9298022866146091a3e"}, - {file = "orjson-3.9.5-cp39-none-win32.whl", hash = "sha256:664cff27f85939059472afd39acff152fbac9a091b7137092cb651cf5f7747b5"}, - {file = "orjson-3.9.5-cp39-none-win_amd64.whl", hash = "sha256:91dda66755795ac6100e303e206b636568d42ac83c156547634256a2e68de694"}, - {file = "orjson-3.9.5.tar.gz", hash = "sha256:6daf5ee0b3cf530b9978cdbf71024f1c16ed4a67d05f6ec435c6e7fe7a52724c"}, + {file = "orjson-3.10.5-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:545d493c1f560d5ccfc134803ceb8955a14c3fcb47bbb4b2fee0232646d0b932"}, + {file = "orjson-3.10.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4324929c2dd917598212bfd554757feca3e5e0fa60da08be11b4aa8b90013c1"}, + {file = "orjson-3.10.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8c13ca5e2ddded0ce6a927ea5a9f27cae77eee4c75547b4297252cb20c4d30e6"}, + {file = "orjson-3.10.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b6c8e30adfa52c025f042a87f450a6b9ea29649d828e0fec4858ed5e6caecf63"}, + {file = "orjson-3.10.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:338fd4f071b242f26e9ca802f443edc588fa4ab60bfa81f38beaedf42eda226c"}, + {file = "orjson-3.10.5-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6970ed7a3126cfed873c5d21ece1cd5d6f83ca6c9afb71bbae21a0b034588d96"}, + {file = "orjson-3.10.5-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:235dadefb793ad12f7fa11e98a480db1f7c6469ff9e3da5e73c7809c700d746b"}, + {file = "orjson-3.10.5-cp310-none-win32.whl", hash = "sha256:be79e2393679eda6a590638abda16d167754393f5d0850dcbca2d0c3735cebe2"}, + {file = "orjson-3.10.5-cp310-none-win_amd64.whl", hash = "sha256:c4a65310ccb5c9910c47b078ba78e2787cb3878cdded1702ac3d0da71ddc5228"}, + {file = "orjson-3.10.5-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:cdf7365063e80899ae3a697def1277c17a7df7ccfc979990a403dfe77bb54d40"}, + {file = "orjson-3.10.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b68742c469745d0e6ca5724506858f75e2f1e5b59a4315861f9e2b1df77775a"}, + {file = "orjson-3.10.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7d10cc1b594951522e35a3463da19e899abe6ca95f3c84c69e9e901e0bd93d38"}, + {file = "orjson-3.10.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dcbe82b35d1ac43b0d84072408330fd3295c2896973112d495e7234f7e3da2e1"}, + {file = "orjson-3.10.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10c0eb7e0c75e1e486c7563fe231b40fdd658a035ae125c6ba651ca3b07936f5"}, + {file = "orjson-3.10.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:53ed1c879b10de56f35daf06dbc4a0d9a5db98f6ee853c2dbd3ee9d13e6f302f"}, + {file = "orjson-3.10.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:099e81a5975237fda3100f918839af95f42f981447ba8f47adb7b6a3cdb078fa"}, + {file = "orjson-3.10.5-cp311-none-win32.whl", hash = "sha256:1146bf85ea37ac421594107195db8bc77104f74bc83e8ee21a2e58596bfb2f04"}, + {file = "orjson-3.10.5-cp311-none-win_amd64.whl", hash = "sha256:36a10f43c5f3a55c2f680efe07aa93ef4a342d2960dd2b1b7ea2dd764fe4a37c"}, + {file = "orjson-3.10.5-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:68f85ecae7af14a585a563ac741b0547a3f291de81cd1e20903e79f25170458f"}, 
+ {file = "orjson-3.10.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28afa96f496474ce60d3340fe8d9a263aa93ea01201cd2bad844c45cd21f5268"}, + {file = "orjson-3.10.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9cd684927af3e11b6e754df80b9ffafd9fb6adcaa9d3e8fdd5891be5a5cad51e"}, + {file = "orjson-3.10.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d21b9983da032505f7050795e98b5d9eee0df903258951566ecc358f6696969"}, + {file = "orjson-3.10.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ad1de7fef79736dde8c3554e75361ec351158a906d747bd901a52a5c9c8d24b"}, + {file = "orjson-3.10.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2d97531cdfe9bdd76d492e69800afd97e5930cb0da6a825646667b2c6c6c0211"}, + {file = "orjson-3.10.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d69858c32f09c3e1ce44b617b3ebba1aba030e777000ebdf72b0d8e365d0b2b3"}, + {file = "orjson-3.10.5-cp312-none-win32.whl", hash = "sha256:64c9cc089f127e5875901ac05e5c25aa13cfa5dbbbd9602bda51e5c611d6e3e2"}, + {file = "orjson-3.10.5-cp312-none-win_amd64.whl", hash = "sha256:b2efbd67feff8c1f7728937c0d7f6ca8c25ec81373dc8db4ef394c1d93d13dc5"}, + {file = "orjson-3.10.5-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:03b565c3b93f5d6e001db48b747d31ea3819b89abf041ee10ac6988886d18e01"}, + {file = "orjson-3.10.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:584c902ec19ab7928fd5add1783c909094cc53f31ac7acfada817b0847975f26"}, + {file = "orjson-3.10.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5a35455cc0b0b3a1eaf67224035f5388591ec72b9b6136d66b49a553ce9eb1e6"}, + {file = "orjson-3.10.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1670fe88b116c2745a3a30b0f099b699a02bb3482c2591514baf5433819e4f4d"}, + {file = "orjson-3.10.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:185c394ef45b18b9a7d8e8f333606e2e8194a50c6e3c664215aae8cf42c5385e"}, + {file = "orjson-3.10.5-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:ca0b3a94ac8d3886c9581b9f9de3ce858263865fdaa383fbc31c310b9eac07c9"}, + {file = "orjson-3.10.5-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:dfc91d4720d48e2a709e9c368d5125b4b5899dced34b5400c3837dadc7d6271b"}, + {file = "orjson-3.10.5-cp38-none-win32.whl", hash = "sha256:c05f16701ab2a4ca146d0bca950af254cb7c02f3c01fca8efbbad82d23b3d9d4"}, + {file = "orjson-3.10.5-cp38-none-win_amd64.whl", hash = "sha256:8a11d459338f96a9aa7f232ba95679fc0c7cedbd1b990d736467894210205c09"}, + {file = "orjson-3.10.5-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:85c89131d7b3218db1b24c4abecea92fd6c7f9fab87441cfc342d3acc725d807"}, + {file = "orjson-3.10.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb66215277a230c456f9038d5e2d84778141643207f85336ef8d2a9da26bd7ca"}, + {file = "orjson-3.10.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:51bbcdea96cdefa4a9b4461e690c75ad4e33796530d182bdd5c38980202c134a"}, + {file = "orjson-3.10.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbead71dbe65f959b7bd8cf91e0e11d5338033eba34c114f69078d59827ee139"}, + {file = "orjson-3.10.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5df58d206e78c40da118a8c14fc189207fffdcb1f21b3b4c9c0c18e839b5a214"}, + {file = 
"orjson-3.10.5-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:c4057c3b511bb8aef605616bd3f1f002a697c7e4da6adf095ca5b84c0fd43595"}, + {file = "orjson-3.10.5-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b39e006b00c57125ab974362e740c14a0c6a66ff695bff44615dcf4a70ce2b86"}, + {file = "orjson-3.10.5-cp39-none-win32.whl", hash = "sha256:eded5138cc565a9d618e111c6d5c2547bbdd951114eb822f7f6309e04db0fb47"}, + {file = "orjson-3.10.5-cp39-none-win_amd64.whl", hash = "sha256:cc28e90a7cae7fcba2493953cff61da5a52950e78dc2dacfe931a317ee3d8de7"}, + {file = "orjson-3.10.5.tar.gz", hash = "sha256:7a5baef8a4284405d96c90c7c62b755e9ef1ada84c2406c24a9ebec86b89f46d"}, ] [[package]] @@ -9657,4 +9643,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "e517168f2ff67c46f3b37d7dcde88b73a1e2ae0d6890243b4c6d1e0aa504eff7" +content-hash = "bb75ee485742aa176ad726fd468832642096145fff0543472b998e04b8b053d0" diff --git a/pyproject.toml b/pyproject.toml index 849626314a..099850b6bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,8 @@ astunparse = ">=1.6.3" gitpython = ">=3.1.29" pytz = ">=2022.6" giturlparse = ">=0.10.0" -orjson = {version = ">=3.6.7,<=3.9.10", markers="platform_python_implementation != 'PyPy'"} +# exclude some versions because of segfault bugs in orjson +orjson = {version = ">=3.6.7,<4,!=3.9.11,!=3.9.12,!=3.9.13,!=3.9.14,!=3.10.1", markers="platform_python_implementation != 'PyPy'"} tenacity = ">=8.0.2" jsonpath-ng = ">=1.5.3" fsspec = ">=2022.4.0" From 48c93f5864d9d8b739bd11cd64811c9b0d42a461 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Thu, 4 Jul 2024 13:45:57 -0400 Subject: [PATCH 53/61] Fix/qdrant tests in CI (#1526) * Run qdrant server in local tests * Add qdrant to test destination configs * Fix stringify UUID objects * Install qdrant deps * Fix qdrant image version * Disable httpx logging in tests * Add index and use order by for fetching state * Try qdrant local support * Fix qdrant load stored state * Disable parallel load in qdrant local * Test destination config for qdrant local and server * Fixes * qdrant example test * Missing module * Cleanup * resolves configuration to get full capabilities in load * uses embedded qdrant for zendesk example --------- Co-authored-by: Marcin Rudolf --- .github/workflows/test_doc_snippets.yml | 2 + .github/workflows/test_local_destinations.yml | 10 +- dlt/common/destination/reference.py | 45 ++++++ dlt/destinations/impl/qdrant/configuration.py | 59 ++++++- dlt/destinations/impl/qdrant/exceptions.py | 11 ++ dlt/destinations/impl/qdrant/factory.py | 16 ++ dlt/destinations/impl/qdrant/qdrant_client.py | 122 ++++++++------ dlt/load/load.py | 4 +- dlt/pipeline/pipeline.py | 1 + .../examples/qdrant_zendesk/qdrant_zendesk.py | 39 ++--- docs/tools/prepare_examples_tests.py | 3 +- tests/conftest.py | 3 + tests/load/pipeline/test_arrow_loading.py | 2 +- tests/load/pipeline/test_restore_state.py | 152 ++++++++++-------- tests/load/qdrant/test_pipeline.py | 21 +++ tests/load/utils.py | 12 ++ 16 files changed, 348 insertions(+), 154 deletions(-) create mode 100644 dlt/destinations/impl/qdrant/exceptions.py diff --git a/.github/workflows/test_doc_snippets.yml b/.github/workflows/test_doc_snippets.yml index 6d4e6dda53..b140935d4c 100644 --- a/.github/workflows/test_doc_snippets.yml +++ b/.github/workflows/test_doc_snippets.yml @@ -21,6 +21,8 @@ env: # Slack hook for chess in production example RUNTIME__SLACK_INCOMING_HOOK: ${{ secrets.RUNTIME__SLACK_INCOMING_HOOK }} + # Path to 
local qdrant database + DESTINATION__QDRANT__CREDENTIALS__PATH: zendesk.qdb # detect if the workflow is executed in a repo fork IS_FORK: ${{ github.event.pull_request.head.repo.fork }} diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 263d3f588c..f1bf6016bc 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -21,7 +21,7 @@ env: RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 RUNTIME__LOG_LEVEL: ERROR RUNTIME__DLTHUB_TELEMETRY_ENDPOINT: ${{ secrets.RUNTIME__DLTHUB_TELEMETRY_ENDPOINT }} - ACTIVE_DESTINATIONS: "[\"duckdb\", \"postgres\", \"filesystem\", \"weaviate\"]" + ACTIVE_DESTINATIONS: "[\"duckdb\", \"postgres\", \"filesystem\", \"weaviate\", \"qdrant\"]" ALL_FILESYSTEM_DRIVERS: "[\"memory\", \"file\"]" DESTINATION__WEAVIATE__VECTORIZER: text2vec-contextionary @@ -63,6 +63,11 @@ jobs: --health-timeout 5s --health-retries 5 + qdrant: + image: qdrant/qdrant:v1.8.4 + ports: + - 6333:6333 + steps: - name: Check out uses: actions/checkout@master @@ -90,7 +95,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant --with sentry-sdk --with pipeline -E deltalake - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml @@ -100,6 +105,7 @@ jobs: name: Run tests Linux env: DESTINATION__POSTGRES__CREDENTIALS: postgresql://loader:loader@localhost:5432/dlt_data + DESTINATION__QDRANT__CREDENTIALS__location: http://localhost:6333 - name: Stop weaviate if: always() diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index a735aad5cf..259389a5e9 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -63,6 +63,28 @@ class StorageSchemaInfo(NamedTuple): inserted_at: datetime.datetime schema: str + @classmethod + def from_normalized_mapping( + cls, normalized_doc: Dict[str, Any], naming_convention: NamingConvention + ) -> "StorageSchemaInfo": + """Instantiate this class from mapping where keys are normalized according to given naming convention + + Args: + normalized_doc: Mapping with normalized keys (e.g. 
{Version: ..., SchemaName: ...}) + naming_convention: Naming convention that was used to normalize keys + + Returns: + StorageSchemaInfo: Instance of this class + """ + return cls( + version_hash=normalized_doc[naming_convention.normalize_identifier("version_hash")], + schema_name=normalized_doc[naming_convention.normalize_identifier("schema_name")], + version=normalized_doc[naming_convention.normalize_identifier("version")], + engine_version=normalized_doc[naming_convention.normalize_identifier("engine_version")], + inserted_at=normalized_doc[naming_convention.normalize_identifier("inserted_at")], + schema=normalized_doc[naming_convention.normalize_identifier("schema")], + ) + @dataclasses.dataclass class StateInfo: @@ -82,6 +104,29 @@ def as_doc(self) -> TPipelineStateDoc: doc.pop("version_hash") return doc + @classmethod + def from_normalized_mapping( + cls, normalized_doc: Dict[str, Any], naming_convention: NamingConvention + ) -> "StateInfo": + """Instantiate this class from mapping where keys are normalized according to given naming convention + + Args: + normalized_doc: Mapping with normalized keys (e.g. {Version: ..., PipelineName: ...}) + naming_convention: Naming convention that was used to normalize keys + + Returns: + StateInfo: Instance of this class + """ + return cls( + version=normalized_doc[naming_convention.normalize_identifier("version")], + engine_version=normalized_doc[naming_convention.normalize_identifier("engine_version")], + pipeline_name=normalized_doc[naming_convention.normalize_identifier("pipeline_name")], + state=normalized_doc[naming_convention.normalize_identifier("state")], + created_at=normalized_doc[naming_convention.normalize_identifier("created_at")], + version_hash=normalized_doc.get(naming_convention.normalize_identifier("version_hash")), + _dlt_load_id=normalized_doc.get(naming_convention.normalize_identifier("_dlt_load_id")), + ) + @configspec class DestinationClientConfiguration(BaseConfiguration): diff --git a/dlt/destinations/impl/qdrant/configuration.py b/dlt/destinations/impl/qdrant/configuration.py index 4d1ed1234d..baf5e5dc59 100644 --- a/dlt/destinations/impl/qdrant/configuration.py +++ b/dlt/destinations/impl/qdrant/configuration.py @@ -1,6 +1,6 @@ import dataclasses -from typing import Optional, Final -from typing_extensions import Annotated +from typing import Optional, Final, Any +from typing_extensions import Annotated, TYPE_CHECKING from dlt.common.configuration import configspec, NotResolved from dlt.common.configuration.specs.base_configuration import ( @@ -8,11 +8,17 @@ CredentialsConfiguration, ) from dlt.common.destination.reference import DestinationClientDwhConfiguration +from dlt.destinations.impl.qdrant.exceptions import InvalidInMemoryQdrantCredentials + +if TYPE_CHECKING: + from qdrant_client import QdrantClient @configspec class QdrantCredentials(CredentialsConfiguration): - # If `:memory:` - use in-memory Qdrant instance. + if TYPE_CHECKING: + _external_client: "QdrantClient" + # If `str` - use it as a `url` parameter. # If `None` - use default values for `host` and `port` location: Optional[str] = None @@ -21,6 +27,47 @@ class QdrantCredentials(CredentialsConfiguration): # Persistence path for QdrantLocal. 
Default: `None` path: Optional[str] = None + def is_local(self) -> bool: + return self.path is not None + + def on_resolved(self) -> None: + if self.location == ":memory:": + raise InvalidInMemoryQdrantCredentials() + + def parse_native_representation(self, native_value: Any) -> None: + try: + from qdrant_client import QdrantClient + + if isinstance(native_value, QdrantClient): + self._external_client = native_value + self.resolve() + except ModuleNotFoundError: + pass + + super().parse_native_representation(native_value) + + def _create_client(self, model: str, **options: Any) -> "QdrantClient": + from qdrant_client import QdrantClient + + creds = dict(self) + if creds["path"]: + del creds["location"] + + client = QdrantClient(**creds, **options) + client.set_model(model) + return client + + def get_client(self, model: str, **options: Any) -> "QdrantClient": + client = getattr(self, "_external_client", None) + return client or self._create_client(model, **options) + + def close_client(self, client: "QdrantClient") -> None: + """Close client if not external""" + if getattr(self, "_external_client", None) is client: + # Do not close client created externally + return + client.close() + def __str__(self) -> str: return self.location or "localhost" @@ -81,6 +128,12 @@ class QdrantClientConfiguration(DestinationClientDwhConfiguration): # Find the list here. https://qdrant.github.io/fastembed/examples/Supported_Models/. model: str = "BAAI/bge-small-en" + def get_client(self) -> "QdrantClient": + return self.credentials.get_client(self.model, **dict(self.options)) + + def close_client(self, client: "QdrantClient") -> None: + self.credentials.close_client(client) + def fingerprint(self) -> str: """Returns a fingerprint of a connection string""" diff --git a/dlt/destinations/impl/qdrant/exceptions.py b/dlt/destinations/impl/qdrant/exceptions.py new file mode 100644 index 0000000000..19f33f64c1 --- /dev/null +++ b/dlt/destinations/impl/qdrant/exceptions.py @@ -0,0 +1,11 @@ +from dlt.common.destination.exceptions import DestinationTerminalException + + +class InvalidInMemoryQdrantCredentials(DestinationTerminalException): + def __init__(self) -> None: + super().__init__( + "To use in-memory instance of qdrant, " + "please instantiate it first and then pass to destination factory\n" + '\nclient = QdrantClient(":memory:")\n' + 'dlt.pipeline(pipeline_name="...", destination=dlt.destinations.qdrant(client)' + ) diff --git a/dlt/destinations/impl/qdrant/factory.py b/dlt/destinations/impl/qdrant/factory.py index defd29a03a..2bface0938 100644 --- a/dlt/destinations/impl/qdrant/factory.py +++ b/dlt/destinations/impl/qdrant/factory.py @@ -1,6 +1,8 @@ import typing as t from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.destination.reference import TDestinationConfig +from dlt.common.normalizers.naming import NamingConvention from dlt.destinations.impl.qdrant.configuration import QdrantCredentials, QdrantClientConfiguration @@ -26,6 +28,20 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: return caps + @classmethod + def adjust_capabilities( + cls, + caps: DestinationCapabilitiesContext, + config: QdrantClientConfiguration, + naming: t.Optional[NamingConvention], + ) -> DestinationCapabilitiesContext: + caps = super(qdrant, cls).adjust_capabilities(caps, config, naming) + if config.credentials.is_local(): + # Local qdrant can not load in parallel + caps.loader_parallelism_strategy = "sequential" + caps.max_parallel_load_jobs = 1 + return caps + 
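As an illustration of the behavior this enables (editor's sketch, not part of the patch; the pipeline name, path, and sample rows are invented for the example): pointing the `qdrant` destination factory at a local path makes `credentials.is_local()` return `True`, so the capabilities adjusted above force a sequential load with a single job.

```py
import dlt

# Local, path-based Qdrant: dlt detects is_local() and loads sequentially.
pipeline = dlt.pipeline(
    pipeline_name="qdrant_local_sketch",
    destination=dlt.destinations.qdrant(path="/var/dlt/local_qdrant.qdb"),
    dataset_name="demo",
)

load_info = pipeline.run(
    [{"doc_id": 1, "content": "hello"}, {"doc_id": 2, "content": "world"}],
    table_name="docs",
)
print(load_info)
```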
@property def client_class(self) -> t.Type["QdrantClient"]: from dlt.destinations.impl.qdrant.qdrant_client import QdrantClient diff --git a/dlt/destinations/impl/qdrant/qdrant_client.py b/dlt/destinations/impl/qdrant/qdrant_client.py index 51915c5536..80c158d51a 100644 --- a/dlt/destinations/impl/qdrant/qdrant_client.py +++ b/dlt/destinations/impl/qdrant/qdrant_client.py @@ -1,5 +1,6 @@ from types import TracebackType from typing import Optional, Sequence, List, Dict, Type, Iterable, Any +import threading from dlt.common import logger from dlt.common.json import json @@ -13,6 +14,7 @@ ) from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import TLoadJobState, LoadJob, JobClientBase, WithStateSync +from dlt.common.destination.exceptions import DestinationUndefinedEntity from dlt.common.storages import FileStorage from dlt.common.time import precise_time @@ -46,6 +48,7 @@ def __init__( self.config = client_config with FileStorage.open_zipsafe_ro(local_path) as f: + ids: List[str] docs, payloads, ids = [], [], [] for line in f: @@ -53,7 +56,7 @@ def __init__( point_id = ( self._generate_uuid(data, self.unique_identifiers, self.collection_name) if self.unique_identifiers - else uuid.uuid4() + else str(uuid.uuid4()) ) payloads.append(data) ids.append(point_id) @@ -179,22 +182,6 @@ def dataset_name(self) -> str: def sentinel_collection(self) -> str: return self.dataset_name or "DltSentinelCollection" - @staticmethod - def _create_db_client(config: QdrantClientConfiguration) -> QC: - """Generates a Qdrant client from the 'qdrant_client' package. - - Args: - config (QdrantClientConfiguration): Credentials and options for the Qdrant client. - - Returns: - QdrantClient: A Qdrant client instance. - """ - credentials = dict(config.credentials) - options = dict(config.options) - client = QC(**credentials, **options) - client.set_model(config.model) - return client - def _make_qualified_collection_name(self, table_name: str) -> str: """Generates a qualified collection name. @@ -240,14 +227,11 @@ def _create_point_no_vector(self, obj: Dict[str, Any], collection_name: str) -> obj (Dict[str, Any]): The arbitrary data to be inserted as payload. collection_name (str): The name of the collection to insert the point into. 
""" - # we want decreased ids because the point scroll functions orders by id ASC - # so we want newest first - id_ = 2**64 - int(precise_time() * 10**6) self.db_client.upsert( collection_name, points=[ models.PointStruct( - id=id_, + id=str(uuid.uuid4()), payload=obj, vector={}, ) @@ -331,7 +315,7 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: p_load_id = self.schema.naming.normalize_identifier("load_id") p_dlt_load_id = self.schema.naming.normalize_identifier("_dlt_load_id") p_pipeline_name = self.schema.naming.normalize_identifier("pipeline_name") - # p_created_at = self.schema.naming.normalize_identifier("created_at") + p_created_at = self.schema.naming.normalize_identifier("created_at") limit = 100 offset = None @@ -350,15 +334,13 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: ) ] ), - # search by package load id which is guaranteed to increase over time - # order_by=models.OrderBy( - # key=p_created_at, - # # direction=models.Direction.DESC, - # ), + order_by=models.OrderBy( + key=p_created_at, + direction=models.Direction.DESC, + ), limit=limit, offset=offset, ) - # print("state_r", state_records) if len(state_records) == 0: return None for state_record in state_records: @@ -378,21 +360,24 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: ] ), ) - if load_records.count > 0: - return StateInfo(**state) - except Exception: - return None + if load_records.count == 0: + return None + return StateInfo.from_normalized_mapping(state, self.schema.naming) + except UnexpectedResponse as e: + if e.status_code == 404: + raise DestinationUndefinedEntity(str(e)) from e + raise + except ValueError as e: # Local qdrant error + if "not found" in str(e): + raise DestinationUndefinedEntity(str(e)) from e + raise def get_stored_schema(self) -> Optional[StorageSchemaInfo]: """Retrieves newest schema from destination storage""" try: scroll_table_name = self._make_qualified_collection_name(self.schema.version_table_name) p_schema_name = self.schema.naming.normalize_identifier("schema_name") - # this works only because we create points that have no vectors - # with decreasing ids. 
so newest (lowest ids) go first - # we do not use order_by because it requires and index to be created - # and this behavior is different for local and cloud qdrant - # p_inserted_at = self.schema.naming.normalize_identifier("inserted_at") + p_inserted_at = self.schema.naming.normalize_identifier("inserted_at") response = self.db_client.scroll( scroll_table_name, with_payload=True, @@ -405,15 +390,23 @@ def get_stored_schema(self) -> Optional[StorageSchemaInfo]: ] ), limit=1, - # order_by=models.OrderBy( - # key=p_inserted_at, - # direction=models.Direction.DESC, - # ) + order_by=models.OrderBy( + key=p_inserted_at, + direction=models.Direction.DESC, + ), ) - record = response[0][0].payload - return StorageSchemaInfo(**record) - except Exception: - return None + if not response[0]: + return None + payload = response[0][0].payload + return StorageSchemaInfo.from_normalized_mapping(payload, self.schema.naming) + except UnexpectedResponse as e: + if e.status_code == 404: + raise DestinationUndefinedEntity(str(e)) from e + raise + except ValueError as e: # Local qdrant error + if "not found" in str(e): + raise DestinationUndefinedEntity(str(e)) from e + raise def get_stored_schema_by_hash(self, schema_hash: str) -> Optional[StorageSchemaInfo]: try: @@ -431,10 +424,18 @@ def get_stored_schema_by_hash(self, schema_hash: str) -> Optional[StorageSchemaI ), limit=1, ) - record = response[0][0].payload - return StorageSchemaInfo(**record) - except Exception: - return None + if not response[0]: + return None + payload = response[0][0].payload + return StorageSchemaInfo.from_normalized_mapping(payload, self.schema.naming) + except UnexpectedResponse as e: + if e.status_code == 404: + return None + raise + except ValueError as e: # Local qdrant error + if "not found" in str(e): + return None + raise def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: return LoadQdrantJob( @@ -456,7 +457,7 @@ def complete_load(self, load_id: str) -> None: self._create_point_no_vector(properties, loads_table_name) def __enter__(self) -> "QdrantClient": - self.db_client = QdrantClient._create_db_client(self.config) + self.db_client = self.config.get_client() return self def __exit__( @@ -466,7 +467,7 @@ def __exit__( exc_tb: TracebackType, ) -> None: if self.db_client: - self.db_client.close() + self.config.close_client(self.db_client) self.db_client = None def _update_schema_in_storage(self, schema: Schema) -> None: @@ -485,13 +486,30 @@ def _update_schema_in_storage(self, schema: Schema) -> None: self._create_point_no_vector(properties, version_table_name) def _execute_schema_update(self, only_tables: Iterable[str]) -> None: + is_local = self.config.credentials.is_local() for table_name in only_tables or self.schema.tables: exists = self._collection_exists(table_name) + qualified_collection_name = self._make_qualified_collection_name(table_name) if not exists: self._create_collection( - full_collection_name=self._make_qualified_collection_name(table_name) + full_collection_name=qualified_collection_name, ) + if not is_local: # Indexes don't work in local Qdrant (trigger log warning) + # Create indexes to enable order_by in state and schema tables + if table_name == self.schema.state_table_name: + self.db_client.create_payload_index( + collection_name=qualified_collection_name, + field_name=self.schema.naming.normalize_identifier("created_at"), + field_schema="datetime", + ) + elif table_name == self.schema.version_table_name: + self.db_client.create_payload_index( + 
collection_name=qualified_collection_name, + field_name=self.schema.naming.normalize_identifier("inserted_at"), + field_schema="datetime", + ) + self._update_schema_in_storage(self.schema) def _collection_exists(self, table_name: str, qualify_table_name: bool = True) -> bool: diff --git a/dlt/load/load.py b/dlt/load/load.py index 9d1d953f7f..76b4806694 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -196,7 +196,9 @@ def w_spool_job( def spool_new_jobs(self, load_id: str, schema: Schema) -> Tuple[int, List[LoadJob]]: # use thread based pool as jobs processing is mostly I/O and we do not want to pickle jobs load_files = filter_new_jobs( - self.load_storage.list_new_jobs(load_id), self.destination.capabilities(), self.config + self.load_storage.list_new_jobs(load_id), + self.destination.capabilities(self.destination.configuration(self.initial_client_config)), + self.config, ) file_count = len(load_files) if file_count == 0: diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index e4a7c7c4a8..2bfee3fd29 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -723,6 +723,7 @@ def sync_destination( try: try: restored_schemas: Sequence[Schema] = None + remote_state = self._restore_state_from_destination() # if remote state is newer or same diff --git a/docs/examples/qdrant_zendesk/qdrant_zendesk.py b/docs/examples/qdrant_zendesk/qdrant_zendesk.py index 7fb55fe842..5416f2f2d0 100644 --- a/docs/examples/qdrant_zendesk/qdrant_zendesk.py +++ b/docs/examples/qdrant_zendesk/qdrant_zendesk.py @@ -38,8 +38,6 @@ from dlt.destinations.adapters import qdrant_adapter from qdrant_client import QdrantClient -from dlt.common.configuration.inject import with_config - # function from: https://github.com/dlt-hub/verified-sources/tree/master/sources/zendesk @dlt.source(max_table_nesting=2) @@ -181,29 +179,22 @@ def get_pages( # make sure nothing failed load_info.raise_on_failed_jobs() - # running the Qdrant client to connect to your Qdrant database - - @with_config(sections=("destination", "qdrant", "credentials")) - def get_qdrant_client(location=dlt.secrets.value, api_key=dlt.secrets.value): - return QdrantClient( - url=location, - api_key=api_key, - ) - - # running the Qdrant client to connect to your Qdrant database - qdrant_client = get_qdrant_client() + # getting the authenticated Qdrant client to connect to your Qdrant database + with pipeline.destination_client() as destination_client: + from qdrant_client import QdrantClient - # view Qdrant collections you'll find your dataset here: - print(qdrant_client.get_collections()) + qdrant_client: QdrantClient = destination_client.db_client # type: ignore + # view Qdrant collections you'll find your dataset here: + print(qdrant_client.get_collections()) - # query Qdrant with prompt: getting tickets info close to "cancellation" - response = qdrant_client.query( - "zendesk_data_content", # collection/dataset name with the 'content' suffix -> tickets content table - query_text=["cancel", "cancel subscription"], # prompt to search - limit=3, # limit the number of results to the nearest 3 embeddings - ) + # query Qdrant with prompt: getting tickets info close to "cancellation" + response = qdrant_client.query( + "zendesk_data_content", # collection/dataset name with the 'content' suffix -> tickets content table + query_text="cancel subscription", # prompt to search + limit=3, # limit the number of results to the nearest 3 embeddings + ) - assert len(response) <= 3 and len(response) > 0 + assert len(response) <= 3 and len(response) 
> 0 - # make sure nothing failed - load_info.raise_on_failed_jobs() + # make sure nothing failed + load_info.raise_on_failed_jobs() diff --git a/docs/tools/prepare_examples_tests.py b/docs/tools/prepare_examples_tests.py index d39d311a50..dc0a3c82f9 100644 --- a/docs/tools/prepare_examples_tests.py +++ b/docs/tools/prepare_examples_tests.py @@ -3,6 +3,7 @@ """ import os import argparse +from typing import List import dlt.cli.echo as fmt @@ -10,7 +11,7 @@ # settings SKIP_FOLDERS = ["archive", ".", "_", "local_cache"] -SKIP_EXAMPLES = ["qdrant_zendesk"] +SKIP_EXAMPLES: List[str] = [] # the entry point for the script MAIN_CLAUSE = 'if __name__ == "__main__":' diff --git a/tests/conftest.py b/tests/conftest.py index 020487d878..669fd19c35 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -111,3 +111,6 @@ def _create_pipeline_instance_id(self) -> str: # disable databricks logging for log in ["databricks.sql.client"]: logging.getLogger(log).setLevel("WARNING") + + # disable httpx request logging (too verbose when testing qdrant) + logging.getLogger("httpx").setLevel("WARNING") diff --git a/tests/load/pipeline/test_arrow_loading.py b/tests/load/pipeline/test_arrow_loading.py index 630d84a28c..6d78968996 100644 --- a/tests/load/pipeline/test_arrow_loading.py +++ b/tests/load/pipeline/test_arrow_loading.py @@ -192,7 +192,7 @@ def test_parquet_column_names_are_normalized( def some_data(): yield tbl - pipeline = dlt.pipeline("arrow_" + uniq_id(), destination=destination_config.destination) + pipeline = destination_config.setup_pipeline("arrow_" + uniq_id()) pipeline.extract(some_data()) # Find the extracted file diff --git a/tests/load/pipeline/test_restore_state.py b/tests/load/pipeline/test_restore_state.py index 37f999ff86..d263f165b7 100644 --- a/tests/load/pipeline/test_restore_state.py +++ b/tests/load/pipeline/test_restore_state.py @@ -65,9 +65,10 @@ def test_restore_state_utils(destination_config: DestinationTestConfiguration) - with p.destination_client(p.default_schema.name) as job_client: # type: ignore[assignment] with pytest.raises(DestinationUndefinedEntity): load_pipeline_state_from_destination(p.pipeline_name, job_client) - # sync the schema - p.sync_schema() - # check if schema exists + # sync the schema + p.sync_schema() + # check if schema exists + with p.destination_client(p.default_schema.name) as job_client: # type: ignore[assignment] stored_schema = job_client.get_stored_schema() assert stored_schema is not None # dataset exists, still no table @@ -93,77 +94,87 @@ def test_restore_state_utils(destination_config: DestinationTestConfiguration) - # so dlt in normalize stage infers _state_version table again but with different column order and the column order in schema is different # then in database. parquet is created in schema order and in Redshift it must exactly match the order. 
# schema.bump_version() - p.sync_schema() + p.sync_schema() + with p.destination_client(p.default_schema.name) as job_client: # type: ignore[assignment] stored_schema = job_client.get_stored_schema() assert stored_schema is not None # table is there but no state assert load_pipeline_state_from_destination(p.pipeline_name, job_client) is None - # extract state - with p.managed_state(extract_state=True): - pass - # just run the existing extract - p.normalize(loader_file_format=destination_config.file_format) - p.load() + + # extract state + with p.managed_state(extract_state=True): + pass + # just run the existing extract + p.normalize(loader_file_format=destination_config.file_format) + p.load() + + with p.destination_client(p.default_schema.name) as job_client: # type: ignore[assignment] stored_state = load_pipeline_state_from_destination(p.pipeline_name, job_client) - local_state = p._get_state() - local_state.pop("_local") - assert stored_state == local_state - # extract state again - with p.managed_state(extract_state=True) as managed_state: - # this will be saved - managed_state["sources"] = {"source": dict(JSON_TYPED_DICT_DECODED)} - p.normalize(loader_file_format=destination_config.file_format) - p.load() + local_state = p._get_state() + local_state.pop("_local") + assert stored_state == local_state + # extract state again + with p.managed_state(extract_state=True) as managed_state: + # this will be saved + managed_state["sources"] = {"source": dict(JSON_TYPED_DICT_DECODED)} + p.normalize(loader_file_format=destination_config.file_format) + p.load() + + with p.destination_client(p.default_schema.name) as job_client: # type: ignore[assignment] stored_state = load_pipeline_state_from_destination(p.pipeline_name, job_client) - assert stored_state["sources"] == {"source": JSON_TYPED_DICT_DECODED} - local_state = p._get_state() - local_state.pop("_local") - assert stored_state == local_state - # use the state context manager again but do not change state - with p.managed_state(extract_state=True): - pass - # version not changed - new_local_state = p._get_state() - new_local_state.pop("_local") - assert local_state == new_local_state - p.normalize(loader_file_format=destination_config.file_format) - info = p.load() - assert len(info.loads_ids) == 0 + assert stored_state["sources"] == {"source": JSON_TYPED_DICT_DECODED} + local_state = p._get_state() + local_state.pop("_local") + assert stored_state == local_state + # use the state context manager again but do not change state + with p.managed_state(extract_state=True): + pass + # version not changed + new_local_state = p._get_state() + new_local_state.pop("_local") + assert local_state == new_local_state + p.normalize(loader_file_format=destination_config.file_format) + info = p.load() + assert len(info.loads_ids) == 0 + + with p.destination_client(p.default_schema.name) as job_client: # type: ignore[assignment] new_stored_state = load_pipeline_state_from_destination(p.pipeline_name, job_client) - # new state should not be stored - assert new_stored_state == stored_state - - # change the state in context manager but there's no extract - with p.managed_state(extract_state=False) as managed_state: - managed_state["sources"] = {"source": "test2"} # type: ignore[dict-item] - new_local_state = p._get_state() - new_local_state_local = new_local_state.pop("_local") - assert local_state != new_local_state - # version increased - assert local_state["_state_version"] + 1 == new_local_state["_state_version"] - # last extracted hash does not match 
current version hash - assert new_local_state_local["_last_extracted_hash"] != new_local_state["_version_hash"] - - # use the state context manager again but do not change state - # because _last_extracted_hash is not present (or different), the version will not change but state will be extracted anyway - with p.managed_state(extract_state=True): - pass - new_local_state_2 = p._get_state() - new_local_state_2_local = new_local_state_2.pop("_local") - assert new_local_state == new_local_state_2 - # there's extraction timestamp - assert "_last_extracted_at" in new_local_state_2_local - # and extract hash is == hash - assert new_local_state_2_local["_last_extracted_hash"] == new_local_state_2["_version_hash"] - # but the version didn't change - assert new_local_state["_state_version"] == new_local_state_2["_state_version"] - p.normalize(loader_file_format=destination_config.file_format) - info = p.load() - assert len(info.loads_ids) == 1 + # new state should not be stored + assert new_stored_state == stored_state + + # change the state in context manager but there's no extract + with p.managed_state(extract_state=False) as managed_state: + managed_state["sources"] = {"source": "test2"} # type: ignore[dict-item] + new_local_state = p._get_state() + new_local_state_local = new_local_state.pop("_local") + assert local_state != new_local_state + # version increased + assert local_state["_state_version"] + 1 == new_local_state["_state_version"] + # last extracted hash does not match current version hash + assert new_local_state_local["_last_extracted_hash"] != new_local_state["_version_hash"] + + # use the state context manager again but do not change state + # because _last_extracted_hash is not present (or different), the version will not change but state will be extracted anyway + with p.managed_state(extract_state=True): + pass + new_local_state_2 = p._get_state() + new_local_state_2_local = new_local_state_2.pop("_local") + assert new_local_state == new_local_state_2 + # there's extraction timestamp + assert "_last_extracted_at" in new_local_state_2_local + # and extract hash is == hash + assert new_local_state_2_local["_last_extracted_hash"] == new_local_state_2["_version_hash"] + # but the version didn't change + assert new_local_state["_state_version"] == new_local_state_2["_state_version"] + p.normalize(loader_file_format=destination_config.file_format) + info = p.load() + assert len(info.loads_ids) == 1 + + with p.destination_client(p.default_schema.name) as job_client: # type: ignore[assignment] new_stored_state_2 = load_pipeline_state_from_destination(p.pipeline_name, job_client) - # the stored state changed to next version - assert new_stored_state != new_stored_state_2 - assert new_stored_state["_state_version"] + 1 == new_stored_state_2["_state_version"] + # the stored state changed to next version + assert new_stored_state != new_stored_state_2 + assert new_stored_state["_state_version"] + 1 == new_stored_state_2["_state_version"] @pytest.mark.parametrize( @@ -224,9 +235,10 @@ def _make_dn_name(schema_name: str) -> str: default_schema = Schema("state") p._inject_schema(default_schema) + + # just sync schema without name - will use default schema + p.sync_schema() with p.destination_client() as job_client: - # just sync schema without name - will use default schema - p.sync_schema() assert get_normalized_dataset_name( job_client ) == default_schema.naming.normalize_table_identifier(dataset_name) @@ -242,9 +254,9 @@ def _make_dn_name(schema_name: str) -> str: ) == 
schema_two.naming.normalize_table_identifier(_make_dn_name("two")) schema_three = Schema("three") p._inject_schema(schema_three) + # sync schema with a name + p.sync_schema(schema_three.name) with p._get_destination_clients(schema_three)[0] as job_client: - # sync schema with a name - p.sync_schema(schema_three.name) assert get_normalized_dataset_name( job_client ) == schema_three.naming.normalize_table_identifier(_make_dn_name("three")) diff --git a/tests/load/qdrant/test_pipeline.py b/tests/load/qdrant/test_pipeline.py index e0cb9dab84..a33ecd2a8d 100644 --- a/tests/load/qdrant/test_pipeline.py +++ b/tests/load/qdrant/test_pipeline.py @@ -1,9 +1,12 @@ import pytest from typing import Iterator +from tempfile import TemporaryDirectory +import os import dlt from dlt.common import json from dlt.common.utils import uniq_id +from dlt.common.typing import DictStrStr from dlt.destinations.adapters import qdrant_adapter from dlt.destinations.impl.qdrant.qdrant_adapter import qdrant_adapter, VECTORIZE_HINT @@ -11,6 +14,7 @@ from tests.pipeline.utils import assert_load_info from tests.load.qdrant.utils import drop_active_pipeline_data, assert_collection from tests.load.utils import sequence_generator +from tests.utils import preserve_environ # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -361,3 +365,20 @@ def test_empty_dataset_allowed() -> None: assert client.dataset_name is None assert client.sentinel_collection == "DltSentinelCollection" assert_collection(p, "content", expected_items_count=3) + + +def test_qdrant_local_parallelism_disabled(preserve_environ) -> None: + os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "20" + + with TemporaryDirectory() as tmpdir: + p = dlt.pipeline(destination=dlt.destinations.qdrant(path=tmpdir)) + + # Data writer limit ensures that we create multiple load files to the same table + @dlt.resource + def q_data(): + for i in range(222): + yield {"doc_id": i, "content": f"content {i}"} + + info = p.run(q_data) + + assert_load_info(info) diff --git a/tests/load/utils.py b/tests/load/utils.py index 00ed4e3bf3..9ee933a07a 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -13,6 +13,7 @@ from dlt.common.configuration import resolve_configuration from dlt.common.configuration.container import Container from dlt.common.configuration.specs.config_section_context import ConfigSectionContext +from dlt.common.configuration.specs import CredentialsConfiguration from dlt.common.destination.reference import ( DestinationClientDwhConfiguration, JobClientBase, @@ -129,6 +130,7 @@ class DestinationTestConfiguration: supports_dbt: bool = True disable_compression: bool = False dev_mode: bool = False + credentials: Optional[Union[CredentialsConfiguration, Dict[str, Any]]] = None @property def name(self) -> str: @@ -166,6 +168,10 @@ def setup(self) -> None: if self.destination == "filesystem" or self.disable_compression: os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "True" + if self.credentials is not None: + for key, value in dict(self.credentials).items(): + os.environ[f"DESTINATION__CREDENTIALS__{key.upper()}"] = str(value) + def setup_pipeline( self, pipeline_name: str, dataset_name: str = None, dev_mode: bool = False, **kwargs ) -> dlt.Pipeline: @@ -279,6 +285,12 @@ def destinations_configs( destination_configs += [ DestinationTestConfiguration(destination="weaviate"), DestinationTestConfiguration(destination="lancedb"), + DestinationTestConfiguration( + destination="qdrant", + credentials=dict(path=str(Path(FILE_BUCKET) / 
"qdrant_data")), + extra_info="local-file", + ), + DestinationTestConfiguration(destination="qdrant", extra_info="server"), ] if default_staging_configs or all_staging_configs: From 60c73273b362c7326f5897e59baea94031d5bdf3 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Fri, 5 Jul 2024 08:08:05 +0200 Subject: [PATCH 54/61] improves collision detection when naming convention changes (#1536) * adds more info to pipeline drop and info commands * extracts known env variables to separate module * drops tables on staging * tests create/drop datasets and tables * simplifies drop command and helpers + tests * adds no print linter module and a few other small fixes * improves collision detection when normalizers change * allows glob to work with memory filesystem * replaces walk in filesystem destination with own glob * standardizes drop_dataset beahvior for all destinations * creates athena iceberg tables in random locations --- dlt/cli/_dlt.py | 2 +- dlt/cli/pipeline_command.py | 44 +++- dlt/common/configuration/paths.py | 24 +- dlt/common/json/__init__.py | 5 +- dlt/common/known_env.py | 25 +++ dlt/common/normalizers/utils.py | 10 +- dlt/common/runners/stdout.py | 6 +- dlt/common/runtime/collector.py | 2 +- dlt/common/schema/schema.py | 136 +++++++++--- dlt/common/schema/utils.py | 23 +- dlt/common/storages/fsspec_filesystem.py | 2 +- dlt/common/storages/load_package.py | 14 +- dlt/common/utils.py | 2 +- dlt/destinations/impl/athena/athena.py | 10 +- dlt/destinations/impl/bigquery/bigquery.py | 10 +- dlt/destinations/impl/bigquery/sql_client.py | 29 +-- .../impl/clickhouse/sql_client.py | 26 ++- .../impl/databricks/sql_client.py | 3 - dlt/destinations/impl/destination/factory.py | 4 +- dlt/destinations/impl/dremio/sql_client.py | 10 +- .../impl/filesystem/filesystem.py | 28 +-- .../impl/lancedb/lancedb_client.py | 15 +- dlt/destinations/impl/mssql/sql_client.py | 2 +- dlt/destinations/impl/snowflake/snowflake.py | 6 +- dlt/destinations/impl/synapse/sql_client.py | 15 +- dlt/destinations/sql_client.py | 1 + dlt/extract/decorators.py | 23 +- dlt/extract/extract.py | 7 +- dlt/extract/incremental/__init__.py | 1 + dlt/extract/source.py | 45 ++-- dlt/helpers/airflow_helper.py | 7 +- dlt/helpers/dbt/runner.py | 5 +- dlt/load/load.py | 4 + dlt/load/utils.py | 19 +- dlt/normalize/worker.py | 1 + dlt/pipeline/drop.py | 48 ++-- dlt/pipeline/helpers.py | 49 ++-- dlt/pipeline/pipeline.py | 77 ++++--- dlt/sources/helpers/requests/retry.py | 4 +- poetry.lock | 17 +- pyproject.toml | 1 + tests/cli/common/test_cli_invoke.py | 7 +- .../configuration/test_toml_provider.py | 5 +- .../schema/test_normalize_identifiers.py | 4 +- tests/common/test_json.py | 5 +- tests/extract/test_sources.py | 33 +++ .../load/filesystem/test_filesystem_common.py | 57 +++-- tests/load/lancedb/test_pipeline.py | 4 +- tests/load/pipeline/test_bigquery.py | 2 +- .../load/pipeline/test_filesystem_pipeline.py | 10 +- tests/load/pipeline/test_refresh_modes.py | 90 +++++++- tests/load/test_job_client.py | 7 +- tests/load/test_sql_client.py | 22 ++ tests/load/utils.py | 3 +- tests/pipeline/test_dlt_versions.py | 52 ++++- tests/pipeline/test_drop_helpers.py | 209 ++++++++++++++++++ tests/pipeline/test_pipeline.py | 60 ++++- tests/pipeline/utils.py | 5 +- tox.ini | 3 + 59 files changed, 1010 insertions(+), 330 deletions(-) create mode 100644 dlt/common/known_env.py create mode 100644 tests/pipeline/test_drop_helpers.py diff --git a/dlt/cli/_dlt.py b/dlt/cli/_dlt.py index af4f2f66e9..7c6526c0a2 100644 --- a/dlt/cli/_dlt.py +++ b/dlt/cli/_dlt.py @@ 
-164,7 +164,7 @@ def schema_command_wrapper(file_path: str, format_: str, remove_defaults: bool) schema_str = json.dumps(s.to_dict(remove_defaults=remove_defaults), pretty=True) else: schema_str = s.to_pretty_yaml(remove_defaults=remove_defaults) - print(schema_str) + fmt.echo(schema_str) return 0 diff --git a/dlt/cli/pipeline_command.py b/dlt/cli/pipeline_command.py index d66d884ff2..6aa479a398 100644 --- a/dlt/cli/pipeline_command.py +++ b/dlt/cli/pipeline_command.py @@ -8,7 +8,12 @@ from dlt.common.destination.reference import TDestinationReferenceArg from dlt.common.runners import Venv from dlt.common.runners.stdout import iter_stdout -from dlt.common.schema.utils import group_tables_by_resource, remove_defaults +from dlt.common.schema.utils import ( + group_tables_by_resource, + has_table_seen_data, + is_complete_column, + remove_defaults, +) from dlt.common.storages import FileStorage, PackageStorage from dlt.pipeline.helpers import DropCommand from dlt.pipeline.exceptions import CannotRestorePipelineException @@ -180,6 +185,35 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: fmt.bold(str(res_state_slots)), ) ) + if verbosity > 0: + for table in tables: + incomplete_columns = len( + [ + col + for col in table["columns"].values() + if not is_complete_column(col) + ] + ) + fmt.echo( + "\t%s table %s column(s) %s %s" + % ( + fmt.bold(table["name"]), + fmt.bold(str(len(table["columns"]))), + ( + fmt.style("received data", fg="green") + if has_table_seen_data(table) + else fmt.style("not yet received data", fg="yellow") + ), + ( + fmt.style( + f"{incomplete_columns} incomplete column(s)", + fg="yellow", + ) + if incomplete_columns > 0 + else "" + ), + ) + ) fmt.echo() fmt.echo("Working dir content:") _display_pending_packages() @@ -272,7 +306,7 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: fmt.echo(package_info.asstr(verbosity)) if len(package_info.schema_update) > 0: if verbosity == 0: - print("Add -v option to see schema update. Note that it could be large.") + fmt.echo("Add -v option to see schema update. 
Note that it could be large.") else: tables = remove_defaults({"tables": package_info.schema_update}) # type: ignore fmt.echo(fmt.bold("Schema update:")) @@ -316,7 +350,7 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: fmt.echo( "About to drop the following data in dataset %s in destination %s:" % ( - fmt.bold(drop.info["dataset_name"]), + fmt.bold(p.dataset_name), fmt.bold(p.destination.destination_name), ) ) @@ -329,6 +363,10 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: ) ) fmt.echo("%s: %s" % (fmt.style("Table(s) to drop", fg="green"), drop.info["tables"])) + fmt.echo( + "%s: %s" + % (fmt.style("\twith data in destination", fg="green"), drop.info["tables_with_data"]) + ) fmt.echo( "%s: %s" % ( diff --git a/dlt/common/configuration/paths.py b/dlt/common/configuration/paths.py index 89494ba6bd..9d0b47f8b6 100644 --- a/dlt/common/configuration/paths.py +++ b/dlt/common/configuration/paths.py @@ -1,16 +1,16 @@ import os import tempfile -# dlt settings folder -DOT_DLT = ".dlt" +from dlt.common import known_env + -# dlt data dir is by default not set, see get_dlt_data_dir for details -DLT_DATA_DIR: str = None +# dlt settings folder +DOT_DLT = os.environ.get(known_env.DLT_CONFIG_FOLDER, ".dlt") def get_dlt_project_dir() -> str: """The dlt project dir is the current working directory but may be overridden by DLT_PROJECT_DIR env variable.""" - return os.environ.get("DLT_PROJECT_DIR", ".") + return os.environ.get(known_env.DLT_PROJECT_DIR, ".") def get_dlt_settings_dir() -> str: @@ -27,14 +27,14 @@ def make_dlt_settings_path(path: str) -> str: def get_dlt_data_dir() -> str: - """Gets default directory where pipelines' data will be stored - 1. in user home directory: ~/.dlt/ - 2. if current user is root: in /var/dlt/ - 3. if current user does not have a home directory: in /tmp/dlt/ - 4. if DLT_DATA_DIR is set in env then it is used + """Gets default directory where pipelines' data (working directories) will be stored + 1. if DLT_DATA_DIR is set in env then it is used + 2. in user home directory: ~/.dlt/ + 3. if current user is root: in /var/dlt/ + 4. 
if current user does not have a home directory: in /tmp/dlt/ """ - if "DLT_DATA_DIR" in os.environ: - return os.environ["DLT_DATA_DIR"] + if known_env.DLT_DATA_DIR in os.environ: + return os.environ[known_env.DLT_DATA_DIR] # geteuid not available on Windows if hasattr(os, "geteuid") and os.geteuid() == 0: diff --git a/dlt/common/json/__init__.py b/dlt/common/json/__init__.py index cf68e5d3d4..00d8dcc430 100644 --- a/dlt/common/json/__init__.py +++ b/dlt/common/json/__init__.py @@ -12,6 +12,7 @@ except ImportError: PydanticBaseModel = None # type: ignore[misc] +from dlt.common import known_env from dlt.common.pendulum import pendulum from dlt.common.arithmetics import Decimal from dlt.common.wei import Wei @@ -80,7 +81,7 @@ def custom_encode(obj: Any) -> str: # use PUA range to encode additional types -PUA_START = int(os.environ.get("DLT_JSON_TYPED_PUA_START", "0xf026"), 16) +PUA_START = int(os.environ.get(known_env.DLT_JSON_TYPED_PUA_START, "0xf026"), 16) _DECIMAL = chr(PUA_START) _DATETIME = chr(PUA_START + 1) @@ -191,7 +192,7 @@ def may_have_pua(line: bytes) -> bool: # pick the right impl json: SupportsJson = None -if os.environ.get("DLT_USE_JSON") == "simplejson": +if os.environ.get(known_env.DLT_USE_JSON) == "simplejson": from dlt.common.json import _simplejson as _json_d json = _json_d # type: ignore[assignment] diff --git a/dlt/common/known_env.py b/dlt/common/known_env.py new file mode 100644 index 0000000000..7ac36d252d --- /dev/null +++ b/dlt/common/known_env.py @@ -0,0 +1,25 @@ +"""Defines env variables that `dlt` uses independently of its configuration system""" + +DLT_PROJECT_DIR = "DLT_PROJECT_DIR" +"""The dlt project dir is the current working directory, '.' (current working dir) by default""" + +DLT_DATA_DIR = "DLT_DATA_DIR" +"""Gets default directory where pipelines' data (working directories) will be stored""" + +DLT_CONFIG_FOLDER = "DLT_CONFIG_FOLDER" +"""A folder (path relative to DLT_PROJECT_DIR) where config and secrets are stored""" + +DLT_DEFAULT_NAMING_NAMESPACE = "DLT_DEFAULT_NAMING_NAMESPACE" +"""Python namespace default where naming modules reside, defaults to dlt.common.normalizers.naming""" + +DLT_DEFAULT_NAMING_MODULE = "DLT_DEFAULT_NAMING_MODULE" +"""A module name with the default naming convention, defaults to snake_case""" + +DLT_DLT_ID_LENGTH_BYTES = "DLT_DLT_ID_LENGTH_BYTES" +"""The length of the _dlt_id identifier, before base64 encoding""" + +DLT_USE_JSON = "DLT_USE_JSON" +"""Type of json parser to use, defaults to orjson, may be simplejson""" + +DLT_JSON_TYPED_PUA_START = "DLT_JSON_TYPED_PUA_START" +"""Start of the unicode block within the PUA used to encode types in typed json""" diff --git a/dlt/common/normalizers/utils.py b/dlt/common/normalizers/utils.py index beacf03e4e..d852cfb7d9 100644 --- a/dlt/common/normalizers/utils.py +++ b/dlt/common/normalizers/utils.py @@ -1,9 +1,11 @@ +import os from importlib import import_module from types import ModuleType from typing import Any, Dict, Optional, Type, Tuple, cast, List import dlt from dlt.common import logger +from dlt.common import known_env from dlt.common.configuration.inject import with_config from dlt.common.configuration.specs import known_sections from dlt.common.destination import DestinationCapabilitiesContext @@ -24,9 +26,11 @@ from dlt.common.typing import is_subclass from dlt.common.utils import get_full_class_name, uniq_id_base64, many_uniq_ids_base64 -DEFAULT_NAMING_NAMESPACE = "dlt.common.normalizers.naming" -DLT_ID_LENGTH_BYTES = 10 -DEFAULT_NAMING_MODULE = "snake_case" 
+DEFAULT_NAMING_NAMESPACE = os.environ.get( + known_env.DLT_DEFAULT_NAMING_NAMESPACE, "dlt.common.normalizers.naming" +) +DEFAULT_NAMING_MODULE = os.environ.get(known_env.DLT_DEFAULT_NAMING_MODULE, "snake_case") +DLT_ID_LENGTH_BYTES = int(os.environ.get(known_env.DLT_DLT_ID_LENGTH_BYTES, 10)) def _section_for_schema(kwargs: Dict[str, Any]) -> Tuple[str, ...]: diff --git a/dlt/common/runners/stdout.py b/dlt/common/runners/stdout.py index 6a92838342..bb5251764c 100644 --- a/dlt/common/runners/stdout.py +++ b/dlt/common/runners/stdout.py @@ -21,11 +21,11 @@ def exec_to_stdout(f: AnyFun) -> Iterator[Any]: rv = f() yield rv except Exception as ex: - print(encode_obj(ex), file=sys.stderr, flush=True) + print(encode_obj(ex), file=sys.stderr, flush=True) # noqa raise finally: if rv is not None: - print(encode_obj(rv), flush=True) + print(encode_obj(rv), flush=True) # noqa def iter_std( @@ -126,6 +126,6 @@ def iter_stdout_with_result( if isinstance(exception, Exception): raise exception from cpe else: - print(cpe.stderr, file=sys.stderr) + sys.stderr.write(cpe.stderr) # otherwise reraise cpe raise diff --git a/dlt/common/runtime/collector.py b/dlt/common/runtime/collector.py index e00bca576e..95117b70cc 100644 --- a/dlt/common/runtime/collector.py +++ b/dlt/common/runtime/collector.py @@ -230,7 +230,7 @@ def _log(self, log_level: int, log_message: str) -> None: if isinstance(self.logger, (logging.Logger, logging.LoggerAdapter)): self.logger.log(log_level, log_message) else: - print(log_message, file=self.logger or sys.stdout) + print(log_message, file=self.logger or sys.stdout) # noqa def _start(self, step: str) -> None: self.counters = defaultdict(int) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 52f8545587..9ef638e289 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -442,7 +442,7 @@ def drop_tables( """Drops tables from the schema and returns the dropped tables""" result = [] for table_name in table_names: - table = self.tables.get(table_name) + table = self.get_table(table_name) if table and (not seen_data_only or utils.has_table_seen_data(table)): result.append(self._schema_tables.pop(table_name)) return result @@ -555,9 +555,16 @@ def data_tables( ) ] - def data_table_names(self) -> List[str]: + def data_table_names( + self, seen_data_only: bool = False, include_incomplete: bool = False + ) -> List[str]: """Returns list of table table names. 
Excludes dlt table names.""" - return [t["name"] for t in self.data_tables()] + return [ + t["name"] + for t in self.data_tables( + seen_data_only=seen_data_only, include_incomplete=include_incomplete + ) + ] def dlt_tables(self) -> List[TTableSchema]: """Gets dlt tables""" @@ -728,6 +735,14 @@ def update_normalizers(self) -> None: self._configure_normalizers(explicit_normalizers(schema_name=self._schema_name)) self._compile_settings() + def will_update_normalizers(self) -> bool: + """Checks if schema has any pending normalizer updates due to configuration or destination capabilities""" + # import desired modules + _, to_naming, _ = import_normalizers( + explicit_normalizers(schema_name=self._schema_name), self._normalizers_config + ) + return type(to_naming) is not type(self.naming) # noqa + def set_schema_contract(self, settings: TSchemaContract) -> None: if not settings: self._settings.pop("schema_contract", None) @@ -967,42 +982,91 @@ def _verify_update_normalizers( from_naming: NamingConvention, ) -> TSchemaTables: """Verifies if normalizers can be updated before schema is changed""" - # print(f"{self.name}: {type(to_naming)} {type(naming_module)}") - if from_naming and type(from_naming) is not type(to_naming): + allow_ident_change = normalizers_config.get( + "allow_identifier_change_on_table_with_data", False + ) + + def _verify_identifiers(table: TTableSchema, norm_table: TTableSchema) -> None: + if not allow_ident_change: + # make sure no identifier got changed in table + if norm_table["name"] != table["name"]: + raise TableIdentifiersFrozen( + self.name, + table["name"], + to_naming, + from_naming, + f"Attempt to rename table name to {norm_table['name']}.", + ) + # if len(norm_table["columns"]) != len(table["columns"]): + # print(norm_table["columns"]) + # raise TableIdentifiersFrozen( + # self.name, + # table["name"], + # to_naming, + # from_naming, + # "Number of columns changed after normalization. 
Some columns must have" + # " merged.", + # ) + col_diff = set(norm_table["columns"].keys()).symmetric_difference( + table["columns"].keys() + ) + if len(col_diff) > 0: + raise TableIdentifiersFrozen( + self.name, + table["name"], + to_naming, + from_naming, + f"Some columns got renamed to {col_diff}.", + ) + + naming_changed = from_naming and type(from_naming) is not type(to_naming) + if naming_changed: schema_tables = {} - for table in self._schema_tables.values(): + # check dlt tables + schema_seen_data = any( + utils.has_table_seen_data(t) for t in self._schema_tables.values() + ) + # modify dlt tables using original naming + orig_dlt_tables = [ + (self.version_table_name, utils.version_table()), + (self.loads_table_name, utils.loads_table()), + (self.state_table_name, utils.pipeline_state_table(add_dlt_id=True)), + ] + for existing_table_name, original_table in orig_dlt_tables: + table = self._schema_tables.get(existing_table_name) + # state table is optional + if table: + table = copy(table) + # keep all attributes of the schema table, copy only what we need to normalize + table["columns"] = original_table["columns"] + norm_table = utils.normalize_table_identifiers(table, to_naming) + table_seen_data = utils.has_table_seen_data(norm_table) + if schema_seen_data: + _verify_identifiers(table, norm_table) + schema_tables[norm_table["name"]] = norm_table + + schema_seen_data = False + for table in self.data_tables(include_incomplete=True): + # TODO: when lineage is fully implemented we should use source identifiers + # not `table` which was already normalized norm_table = utils.normalize_table_identifiers(table, to_naming) - if utils.has_table_seen_data(norm_table) and not normalizers_config.get( - "allow_identifier_change_on_table_with_data", False - ): - # make sure no identifier got changed in table - if norm_table["name"] != table["name"]: - raise TableIdentifiersFrozen( - self.name, - table["name"], - to_naming, - from_naming, - f"Attempt to rename table name to {norm_table['name']}.", - ) - if len(norm_table["columns"]) != len(table["columns"]): - raise TableIdentifiersFrozen( - self.name, - table["name"], - to_naming, - from_naming, - "Number of columns changed after normalization. Some columns must have" - " merged.", - ) - col_diff = set(norm_table["columns"].keys()).difference(table["columns"].keys()) - if len(col_diff) > 0: - raise TableIdentifiersFrozen( - self.name, - table["name"], - to_naming, - from_naming, - f"Some columns got renamed to {col_diff}.", - ) + table_seen_data = utils.has_table_seen_data(norm_table) + if table_seen_data: + _verify_identifiers(table, norm_table) schema_tables[norm_table["name"]] = norm_table + schema_seen_data |= table_seen_data + if schema_seen_data and not allow_ident_change: + # if any of the tables has seen data, fail naming convention change + # NOTE: this will be dropped with full identifier lineage. currently we cannot detect + # strict schemas being changed to lax + raise TableIdentifiersFrozen( + self.name, + "-", + to_naming, + from_naming, + "Schema contains tables that received data. 
As a precaution changing naming" + " conventions is disallowed until full identifier lineage is implemented.", + ) # re-index the table names return schema_tables else: diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index f5765be351..cd0cc5aa63 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -802,23 +802,26 @@ def loads_table() -> TTableSchema: return table -def pipeline_state_table() -> TTableSchema: +def pipeline_state_table(add_dlt_id: bool = False) -> TTableSchema: # NOTE: always add new columns at the end of the table so we have identical layout # after an update of existing tables (always at the end) # set to nullable so we can migrate existing tables # WARNING: do not reorder the columns + columns: List[TColumnSchema] = [ + {"name": "version", "data_type": "bigint", "nullable": False}, + {"name": "engine_version", "data_type": "bigint", "nullable": False}, + {"name": "pipeline_name", "data_type": "text", "nullable": False}, + {"name": "state", "data_type": "text", "nullable": False}, + {"name": "created_at", "data_type": "timestamp", "nullable": False}, + {"name": "version_hash", "data_type": "text", "nullable": True}, + {"name": "_dlt_load_id", "data_type": "text", "nullable": False}, + ] + if add_dlt_id: + columns.append({"name": "_dlt_id", "data_type": "text", "nullable": False, "unique": True}) table = new_table( PIPELINE_STATE_TABLE_NAME, write_disposition="append", - columns=[ - {"name": "version", "data_type": "bigint", "nullable": False}, - {"name": "engine_version", "data_type": "bigint", "nullable": False}, - {"name": "pipeline_name", "data_type": "text", "nullable": False}, - {"name": "state", "data_type": "text", "nullable": False}, - {"name": "created_at", "data_type": "timestamp", "nullable": False}, - {"name": "version_hash", "data_type": "text", "nullable": True}, - {"name": "_dlt_load_id", "data_type": "text", "nullable": False}, - ], + columns=columns, # always use caps preferred file format for processing file_format="preferred", ) diff --git a/dlt/common/storages/fsspec_filesystem.py b/dlt/common/storages/fsspec_filesystem.py index f419baed03..be9ae2bbb1 100644 --- a/dlt/common/storages/fsspec_filesystem.py +++ b/dlt/common/storages/fsspec_filesystem.py @@ -319,7 +319,7 @@ def glob_files( rel_path = pathlib.Path(file).relative_to(root_dir).as_posix() file_url = FilesystemConfiguration.make_file_uri(file) else: - rel_path = posixpath.relpath(file, root_dir) + rel_path = posixpath.relpath(file.lstrip("/"), root_dir) file_url = bucket_url_parsed._replace( path=posixpath.join(bucket_url_parsed.path, rel_path) ).geturl() diff --git a/dlt/common/storages/load_package.py b/dlt/common/storages/load_package.py index 9e3185221d..4d84094427 100644 --- a/dlt/common/storages/load_package.py +++ b/dlt/common/storages/load_package.py @@ -72,7 +72,14 @@ class TPipelineStateDoc(TypedDict, total=False): _dlt_load_id: NotRequired[str] -class TLoadPackageState(TVersionedState, total=False): +class TLoadPackageDropTablesState(TypedDict): + dropped_tables: NotRequired[List[TTableSchema]] + """List of tables that are to be dropped from the schema and destination (i.e. when `refresh` mode is used)""" + truncated_tables: NotRequired[List[TTableSchema]] + """List of tables that are to be truncated in the destination (i.e. 
when `refresh='drop_data'` mode is used)""" + + +class TLoadPackageState(TVersionedState, TLoadPackageDropTablesState, total=False): created_at: DateTime """Timestamp when the load package was created""" pipeline_state: NotRequired[TPipelineStateDoc] @@ -82,11 +89,6 @@ class TLoadPackageState(TVersionedState, total=False): destination_state: NotRequired[Dict[str, Any]] """private space for destinations to store state relevant only to the load package""" - dropped_tables: NotRequired[List[TTableSchema]] - """List of tables that are to be dropped from the schema and destination (i.e. when `refresh` mode is used)""" - truncated_tables: NotRequired[List[TTableSchema]] - """List of tables that are to be truncated in the destination (i.e. when `refresh='drop_data'` mode is used)""" - class TLoadPackage(TypedDict, total=False): load_id: str diff --git a/dlt/common/utils.py b/dlt/common/utils.py index 8e89556c39..7109daf497 100644 --- a/dlt/common/utils.py +++ b/dlt/common/utils.py @@ -137,7 +137,7 @@ def flatten_list_of_str_or_dicts(seq: Sequence[Union[StrAny, str]]) -> DictStrAn else: key = str(e) if key in o: - raise KeyError(f"Cannot flatten with duplicate key {k}") + raise KeyError(f"Cannot flatten with duplicate key {key}") o[key] = None return o diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 8d0ffb1d0c..2b76ca782e 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -17,6 +17,7 @@ import re from contextlib import contextmanager +from fsspec import AbstractFileSystem from pendulum.datetime import DateTime, Date from datetime import datetime # noqa: I251 @@ -33,7 +34,8 @@ from dlt.common import logger from dlt.common.exceptions import TerminalValueError -from dlt.common.utils import without_none +from dlt.common.storages.fsspec_filesystem import fsspec_from_config +from dlt.common.utils import uniq_id, without_none from dlt.common.schema import TColumnSchema, Schema, TTableSchema from dlt.common.schema.typing import ( TTableSchema, @@ -425,8 +427,12 @@ def _get_table_update_sql( is_iceberg = self._is_iceberg_table(table) or table.get("write_disposition", None) == "skip" columns = ", ".join([self._get_column_def_sql(c, table_format) for c in new_columns]) + # create unique tag for iceberg table so it is never recreated in the same folder + # athena requires some kind of special cleaning (or that is a bug) so we cannot refresh + # iceberg tables without it + location_tag = uniq_id(6) if is_iceberg else "" # this will fail if the table prefix is not properly defined - table_prefix = self.table_prefix_layout.format(table_name=table_name) + table_prefix = self.table_prefix_layout.format(table_name=table_name + location_tag) location = f"{bucket}/{dataset}/{table_prefix}" # use qualified table names diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index c3a1be4174..d0052c22f0 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -20,7 +20,6 @@ SupportsStagingDestination, ) from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns -from dlt.common.schema.exceptions import UnknownTableException from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat from dlt.common.schema.utils import get_inherited_table_hint from dlt.common.schema.utils import table_schema_has_type @@ -29,9 +28,10 @@ from dlt.destinations.job_impl import DestinationJsonlLoadJob, 
DestinationParquetLoadJob from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.exceptions import ( + DatabaseTransientException, + DatabaseUndefinedRelation, DestinationSchemaWillNotUpdate, DestinationTerminalException, - DestinationTransientException, LoadJobNotExistsException, LoadJobTerminalException, ) @@ -226,7 +226,7 @@ def restore_file_load(self, file_path: str) -> LoadJob: file_path, f"The server reason was: {reason}" ) from gace else: - raise DestinationTransientException(gace) from gace + raise DatabaseTransientException(gace) from gace return job def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: @@ -271,7 +271,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> reason = BigQuerySqlClient._get_reason_from_errors(gace) if reason == "notFound": # google.api_core.exceptions.NotFound: 404 – table not found - raise UnknownTableException(self.schema.name, table["name"]) from gace + raise DatabaseUndefinedRelation(gace) from gace elif ( reason == "duplicate" ): # google.api_core.exceptions.Conflict: 409 PUT – already exists @@ -282,7 +282,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> file_path, f"The server reason was: {reason}" ) from gace else: - raise DestinationTransientException(gace) from gace + raise DatabaseTransientException(gace) from gace return job diff --git a/dlt/destinations/impl/bigquery/sql_client.py b/dlt/destinations/impl/bigquery/sql_client.py index 45e9379af5..e6aee1fc43 100644 --- a/dlt/destinations/impl/bigquery/sql_client.py +++ b/dlt/destinations/impl/bigquery/sql_client.py @@ -193,20 +193,21 @@ def create_dataset(self) -> None: dataset = bigquery.Dataset(self.fully_qualified_dataset_name(escape=False)) dataset.location = self.location dataset.is_case_insensitive = not self.capabilities.has_case_sensitive_identifiers - self._client.create_dataset( - dataset, - retry=self._default_retry, - timeout=self.http_timeout, - ) - - def drop_dataset(self) -> None: - self._client.delete_dataset( - self.fully_qualified_dataset_name(escape=False), - not_found_ok=True, - delete_contents=True, - retry=self._default_retry, - timeout=self.http_timeout, - ) + try: + self._client.create_dataset( + dataset, + retry=self._default_retry, + timeout=self.http_timeout, + ) + except api_core_exceptions.GoogleAPICallError as gace: + reason = BigQuerySqlClient._get_reason_from_errors(gace) + if reason == "notFound": + # google.api_core.exceptions.NotFound: 404 – table not found + raise DatabaseUndefinedRelation(gace) from gace + elif reason in BQ_TERMINAL_REASONS: + raise DatabaseTerminalException(gace) from gace + else: + raise DatabaseTransientException(gace) from gace def execute_sql( self, sql: AnyStr, *args: Any, **kwargs: Any diff --git a/dlt/destinations/impl/clickhouse/sql_client.py b/dlt/destinations/impl/clickhouse/sql_client.py index ee013ea123..8544643017 100644 --- a/dlt/destinations/impl/clickhouse/sql_client.py +++ b/dlt/destinations/impl/clickhouse/sql_client.py @@ -107,16 +107,34 @@ def create_dataset(self) -> None: ) def drop_dataset(self) -> None: + # always try to drop sentinel table + sentinel_table_name = self.make_qualified_table_name( + self.credentials.dataset_sentinel_table_name + ) + # drop a sentinel table + self.execute_sql(f"DROP TABLE {sentinel_table_name} SYNC") + # Since ClickHouse doesn't have schemas, we need to drop all tables in our virtual schema, # or collection of tables, that has the `dataset_name` as a prefix. 
- to_drop_results = self._list_tables() + to_drop_results = [ + f"{self.catalog_name()}.{self.capabilities.escape_identifier(table)}" + for table in self._list_tables() + ] for table in to_drop_results: # The "DROP TABLE" clause is discarded if we allow clickhouse_driver to handle parameter substitution. # This is because the driver incorrectly substitutes the entire query string, causing the "DROP TABLE" keyword to be omitted. # To resolve this, we are forced to provide the full query string here. - self.execute_sql( - f"""DROP TABLE {self.catalog_name()}.{self.capabilities.escape_identifier(table)} SYNC""" - ) + self.execute_sql(f"DROP TABLE {table} SYNC") + + def drop_tables(self, *tables: str) -> None: + """Drops a set of tables if they exist""" + if not tables: + return + statements = [ + f"DROP TABLE IF EXISTS {self.make_qualified_table_name(table)} SYNC;" + for table in tables + ] + self.execute_many(statements) def _list_tables(self) -> List[str]: catalog_name, table_name = self.make_qualified_table_name_path("%", escape=False) diff --git a/dlt/destinations/impl/databricks/sql_client.py b/dlt/destinations/impl/databricks/sql_client.py index da91402803..2af27020ee 100644 --- a/dlt/destinations/impl/databricks/sql_client.py +++ b/dlt/destinations/impl/databricks/sql_client.py @@ -87,9 +87,6 @@ def rollback_transaction(self) -> None: def native_connection(self) -> "DatabricksSqlConnection": return self._conn - def drop_dataset(self) -> None: - self.execute_sql("DROP SCHEMA IF EXISTS %s CASCADE;" % self.fully_qualified_dataset_name()) - def drop_tables(self, *tables: str) -> None: # Tables are drop with `IF EXISTS`, but databricks raises when the schema doesn't exist. # Multi statement exec is safe and the error can be ignored since all tables are in the same schema. diff --git a/dlt/destinations/impl/destination/factory.py b/dlt/destinations/impl/destination/factory.py index 69bb0daa13..e307b651fb 100644 --- a/dlt/destinations/impl/destination/factory.py +++ b/dlt/destinations/impl/destination/factory.py @@ -78,8 +78,10 @@ def __init__( if callable(destination_callable): pass elif destination_callable: + if "." 
not in destination_callable: + raise ValueError("str destination reference must be of format 'module.function'") + module_path, attr_name = destination_callable.rsplit(".", 1) try: - module_path, attr_name = destination_callable.rsplit(".", 1) dest_module = import_module(module_path) except ModuleNotFoundError as e: raise ConfigurationValueError( diff --git a/dlt/destinations/impl/dremio/sql_client.py b/dlt/destinations/impl/dremio/sql_client.py index fac65e7fd0..929aa2a0d8 100644 --- a/dlt/destinations/impl/dremio/sql_client.py +++ b/dlt/destinations/impl/dremio/sql_client.py @@ -32,6 +32,7 @@ def df(self, chunk_size: int = None, **kwargs: Any) -> Optional[DataFrame]: class DremioSqlClient(SqlClientBase[pydremio.DremioConnection]): dbapi: ClassVar[DBApi] = pydremio + SENTINEL_TABLE_NAME: ClassVar[str] = "_dlt_sentinel_table" def __init__( self, @@ -134,7 +135,9 @@ def is_dbapi_exception(ex: Exception) -> bool: return isinstance(ex, (pyarrow.lib.ArrowInvalid, pydremio.MalformedQueryError)) def create_dataset(self) -> None: - pass + # We create a sentinel table which defines wether we consider the dataset created + sentinel_table_name = self.make_qualified_table_name(self.SENTINEL_TABLE_NAME) + self.execute_sql(f"CREATE TABLE {sentinel_table_name} (_dlt_id BIGINT);") def _get_table_names(self) -> List[str]: query = """ @@ -147,6 +150,11 @@ def _get_table_names(self) -> List[str]: return [table[0] for table in tables] def drop_dataset(self) -> None: + # drop sentinel table + sentinel_table_name = self.make_qualified_table_name(self.SENTINEL_TABLE_NAME) + # must exist or we get undefined relation exception + self.execute_sql(f"DROP TABLE {sentinel_table_name}") + table_names = self._get_table_names() for table_name in table_names: full_table_name = self.make_qualified_table_name(table_name) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 00b990d4fa..bf443e061f 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -9,6 +9,7 @@ import dlt from dlt.common import logger, time, json, pendulum +from dlt.common.storages.fsspec_filesystem import glob_files from dlt.common.typing import DictStrAny from dlt.common.schema import Schema, TSchemaTables, TTableSchema from dlt.common.storages import FileStorage, fsspec_from_config @@ -224,7 +225,7 @@ def drop_tables(self, *tables: str, delete_schema: bool = True) -> None: self._delete_file(filename) def truncate_tables(self, table_names: List[str]) -> None: - """Truncate table with given name""" + """Truncate a set of tables with given `table_names`""" table_dirs = set(self.get_table_dirs(table_names)) table_prefixes = [self.get_table_prefix(t) for t in table_names] for table_dir in table_dirs: @@ -302,18 +303,19 @@ def list_table_files(self, table_name: str) -> List[str]: def list_files_with_prefixes(self, table_dir: str, prefixes: List[str]) -> List[str]: """returns all files in a directory that match given prefixes""" result = [] - for current_dir, _dirs, files in self.fs_client.walk(table_dir, detail=False, refresh=True): - for file in files: - # skip INIT files - if file == INIT_FILE_NAME: - continue - filepath = self.pathlib.join( - path_utils.normalize_path_sep(self.pathlib, current_dir), file - ) - for p in prefixes: - if filepath.startswith(p): - result.append(filepath) - break + # we fallback to our own glob implementation that is tested to return consistent results for + # filesystems we support. 
we were not able to use `find` or `walk` because they were selecting + # files wrongly (on azure walk on path1/path2/ would also select files from path1/path2_v2/ but returning wrong dirs) + for details in glob_files(self.fs_client, self.make_remote_uri(table_dir), "**"): + file = details["file_name"] + filepath = self.pathlib.join(table_dir, details["relative_path"]) + # skip INIT files + if file == INIT_FILE_NAME: + continue + for p in prefixes: + if filepath.startswith(p): + result.append(filepath) + break return result def is_storage_initialized(self) -> bool: diff --git a/dlt/destinations/impl/lancedb/lancedb_client.py b/dlt/destinations/impl/lancedb/lancedb_client.py index 128e2c7e7e..79a5de7f77 100644 --- a/dlt/destinations/impl/lancedb/lancedb_client.py +++ b/dlt/destinations/impl/lancedb/lancedb_client.py @@ -12,6 +12,7 @@ Optional, Dict, Sequence, + TYPE_CHECKING, ) import lancedb # type: ignore @@ -71,6 +72,11 @@ from dlt.destinations.job_impl import EmptyLoadJob from dlt.destinations.type_mapping import TypeMapper +if TYPE_CHECKING: + NDArray = ndarray[Any, Any] +else: + NDArray = ndarray + TIMESTAMP_PRECISION_TO_UNIT: Dict[int, str] = {0: "s", 3: "ms", 6: "us", 9: "ns"} UNIT_TO_TIMESTAMP_PRECISION: Dict[str, int] = {v: k for k, v in TIMESTAMP_PRECISION_TO_UNIT.items()} @@ -292,9 +298,7 @@ def delete_table(self, table_name: str) -> None: def query_table( self, table_name: str, - query: Union[ - List[Any], ndarray[Any, Any], Array, ChunkedArray, str, Tuple[Any], None - ] = None, + query: Union[List[Any], NDArray, Array, ChunkedArray, str, Tuple[Any], None] = None, ) -> LanceQueryBuilder: """Query a LanceDB table. @@ -408,8 +412,6 @@ def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns] field: TArrowField for field in arrow_schema: name = self.schema.naming.normalize_identifier(field.name) - print(field.type) - print(field.name) table_schema[name] = { "name": name, **self.type_mapper.from_db_type(field.type), @@ -453,8 +455,6 @@ def _execute_schema_update(self, only_tables: Iterable[str]) -> None: for table_name in only_tables or self.schema.tables: exists, existing_columns = self.get_storage_table(table_name) new_columns = self.schema.get_new_table_columns(table_name, existing_columns) - print(table_name) - print(new_columns) embedding_fields: List[str] = get_columns_names_with_prop( self.schema.get_table(table_name), VECTORIZE_HINT ) @@ -520,7 +520,6 @@ def update_schema_in_storage(self) -> None: write_disposition = self.schema.get_table(self.schema.version_table_name).get( "write_disposition" ) - print("UPLOAD") upload_batch( records, db_client=self.db_client, diff --git a/dlt/destinations/impl/mssql/sql_client.py b/dlt/destinations/impl/mssql/sql_client.py index a360670e77..988b461fa7 100644 --- a/dlt/destinations/impl/mssql/sql_client.py +++ b/dlt/destinations/impl/mssql/sql_client.py @@ -129,7 +129,7 @@ def _drop_views(self, *tables: str) -> None: self.execute_many(statements) def _drop_schema(self) -> None: - self.execute_sql("DROP SCHEMA IF EXISTS %s;" % self.fully_qualified_dataset_name()) + self.execute_sql("DROP SCHEMA %s;" % self.fully_qualified_dataset_name()) def execute_sql( self, sql: AnyStr, *args: Any, **kwargs: Any diff --git a/dlt/destinations/impl/snowflake/snowflake.py b/dlt/destinations/impl/snowflake/snowflake.py index 2a5671b7e7..b0786e9ed6 100644 --- a/dlt/destinations/impl/snowflake/snowflake.py +++ b/dlt/destinations/impl/snowflake/snowflake.py @@ -172,13 +172,13 @@ def __init__( # decide on source format, stage_file_path 
will either be a local file or a bucket path if file_name.endswith("jsonl"): source_format = "( TYPE = 'JSON', BINARY_FORMAT = 'BASE64' )" - if file_name.endswith("parquet"): + elif file_name.endswith("parquet"): source_format = ( "(TYPE = 'PARQUET', BINARY_AS_TEXT = FALSE, USE_LOGICAL_TYPE = TRUE)" # TODO: USE_VECTORIZED_SCANNER inserts null strings into VARIANT JSON # " USE_VECTORIZED_SCANNER = TRUE)" ) - if file_name.endswith("csv"): + elif file_name.endswith("csv"): # empty strings are NULL, no data is NULL, missing columns (ERROR_ON_COLUMN_COUNT_MISMATCH) are NULL csv_format = config.csv_format or CsvFormatConfiguration() source_format = ( @@ -192,6 +192,8 @@ def __init__( column_match_clause = "" if csv_format.on_error_continue: on_error_clause = "ON_ERROR = CONTINUE" + else: + raise ValueError(file_name) with client.begin_transaction(): # PUT and COPY in one tx if local file, otherwise only copy diff --git a/dlt/destinations/impl/synapse/sql_client.py b/dlt/destinations/impl/synapse/sql_client.py index db1b3e7cf6..cd9a929901 100644 --- a/dlt/destinations/impl/synapse/sql_client.py +++ b/dlt/destinations/impl/synapse/sql_client.py @@ -1,12 +1,6 @@ -from typing import ClassVar from contextlib import suppress -from dlt.common.destination import DestinationCapabilitiesContext - from dlt.destinations.impl.mssql.sql_client import PyOdbcMsSqlClient -from dlt.destinations.impl.mssql.configuration import MsSqlCredentials -from dlt.destinations.impl.synapse.configuration import SynapseCredentials - from dlt.destinations.exceptions import DatabaseUndefinedRelation @@ -17,9 +11,6 @@ def drop_tables(self, *tables: str) -> None: # Synapse does not support DROP TABLE IF EXISTS. # Workaround: use DROP TABLE and suppress non-existence errors. statements = [f"DROP TABLE {self.make_qualified_table_name(table)};" for table in tables] - with suppress(DatabaseUndefinedRelation): - self.execute_fragments(statements) - - def _drop_schema(self) -> None: - # Synapse does not support DROP SCHEMA IF EXISTS. 
- self.execute_sql("DROP SCHEMA %s;" % self.fully_qualified_dataset_name()) + for statement in statements: + with suppress(DatabaseUndefinedRelation): + self.execute_sql(statement) diff --git a/dlt/destinations/sql_client.py b/dlt/destinations/sql_client.py index 7912ac4561..f74f1b9224 100644 --- a/dlt/destinations/sql_client.py +++ b/dlt/destinations/sql_client.py @@ -98,6 +98,7 @@ def truncate_tables(self, *tables: str) -> None: self.execute_many(statements) def drop_tables(self, *tables: str) -> None: + """Drops a set of tables if they exist""" if not tables: return statements = [ diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index ad10ef3ad3..1eccd86aad 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -192,11 +192,7 @@ def decorator( # source name is passed directly or taken from decorated function name effective_name = name or get_callable_name(f) - if not schema: - # load the schema from file with name_schema.yaml/json from the same directory, the callable resides OR create new default schema - schema = _maybe_load_schema_for_callable(f, effective_name) or Schema(effective_name) - - if name and name != schema.name: + if schema and name and name != schema.name: raise ExplicitSourceNameInvalid(name, schema.name) # wrap source extraction function in configuration with section @@ -224,12 +220,19 @@ def _eval_rv(_rv: Any, schema_copy: Schema) -> TDltSourceImpl: s.root_key = root_key return s + def _make_schema() -> Schema: + if not schema: + # load the schema from file with name_schema.yaml/json from the same directory, the callable resides OR create new default schema + return _maybe_load_schema_for_callable(f, effective_name) or Schema(effective_name) + else: + # clone the schema passed to decorator, update normalizers, remove processing hints + # NOTE: source may be called several times in many different settings + return schema.clone(update_normalizers=True, remove_processing_hints=True) + @wraps(conf_f) def _wrap(*args: Any, **kwargs: Any) -> TDltSourceImpl: """Wrap a regular function, injection context must be a part of the wrap""" - # clone the schema passed to decorator, update normalizers, remove processing hints - # NOTE: source may be called several times in many different settings - schema_copy = schema.clone(update_normalizers=True, remove_processing_hints=True) + schema_copy = _make_schema() with Container().injectable_context(SourceSchemaInjectableContext(schema_copy)): # configurations will be accessed in this section in the source proxy = Container()[PipelineContext] @@ -249,9 +252,7 @@ async def _wrap_coro(*args: Any, **kwargs: Any) -> TDltSourceImpl: """In case of co-routine we must wrap the whole injection context in awaitable, there's no easy way to avoid some code duplication """ - # clone the schema passed to decorator, update normalizers, remove processing hints - # NOTE: source may be called several times in many different settings - schema_copy = schema.clone(update_normalizers=True, remove_processing_hints=True) + schema_copy = _make_schema() with Container().injectable_context(SourceSchemaInjectableContext(schema_copy)): # configurations will be accessed in this section in the source proxy = Container()[PipelineContext] diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 5769be1a8d..7a24b7f225 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -31,7 +31,7 @@ from dlt.common.storages.load_package import ( ParsedLoadJobFileName, LoadPackageStateInjectableContext, - TPipelineStateDoc, + 
TLoadPackageState, commit_load_package_state, ) from dlt.common.utils import get_callable_name, get_full_class_name @@ -45,7 +45,6 @@ from dlt.extract.storage import ExtractStorage from dlt.extract.extractors import ObjectExtractor, ArrowExtractor, Extractor from dlt.extract.utils import get_data_item_format -from dlt.pipeline.drop import drop_resources def data_to_sources( @@ -371,7 +370,7 @@ def extract( source: DltSource, max_parallel_items: int, workers: int, - load_package_state_update: Optional[Dict[str, Any]] = None, + load_package_state_update: Optional[TLoadPackageState] = None, ) -> str: # generate load package to be able to commit all the sources together later load_id = self.extract_storage.create_load_package( @@ -394,7 +393,7 @@ def extract( ) ): if load_package_state_update: - load_package.state.update(load_package_state_update) # type: ignore[typeddict-item] + load_package.state.update(load_package_state_update) # reset resource states, the `extracted` list contains all the explicit resources and all their parents for resource in source.resources.extracted.values(): diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index bc25c6fee1..11f989e0b2 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -356,6 +356,7 @@ def _join_external_scheduler(self) -> None: f"Specified Incremental last value type {param_type} is not supported. Please use" f" DateTime, Date, float, int or str to join external schedulers.({ex})" ) + return if param_type is Any: logger.warning( diff --git a/dlt/extract/source.py b/dlt/extract/source.py index 9953b56117..7732c4f056 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -215,7 +215,7 @@ def from_data(cls, schema: Schema, section: str, data: Any) -> Self: def name(self) -> str: return self._schema.name - # TODO: 4 properties below must go somewhere else ie. into RelationalSchema which is Schema + Relational normalizer. + # TODO: max_table_nesting/root_key below must go somewhere else ie. into RelationalSchema which is Schema + Relational normalizer. @property def max_table_nesting(self) -> int: """A schema hint that sets the maximum depth of nested table above which the remaining nodes are loaded as structs or JSON.""" @@ -223,25 +223,12 @@ def max_table_nesting(self) -> int: @max_table_nesting.setter def max_table_nesting(self, value: int) -> None: - RelationalNormalizer.update_normalizer_config(self._schema, {"max_nesting": value}) - - @property - def schema_contract(self) -> TSchemaContract: - return self.schema.settings["schema_contract"] - - @schema_contract.setter - def schema_contract(self, settings: TSchemaContract) -> None: - self.schema.set_schema_contract(settings) - - @property - def exhausted(self) -> bool: - """check all selected pipes wether one of them has started. 
if so, the source is exhausted.""" - for resource in self._resources.extracted.values(): - item = resource._pipe.gen - if inspect.isgenerator(item): - if inspect.getgeneratorstate(item) != "GEN_CREATED": - return True - return False + if value is None: + # this also check the normalizer type + config = RelationalNormalizer.get_normalizer_config(self._schema) + config.pop("max_nesting", None) + else: + RelationalNormalizer.update_normalizer_config(self._schema, {"max_nesting": value}) @property def root_key(self) -> bool: @@ -280,6 +267,24 @@ def root_key(self, value: bool) -> None: propagation_config = config["propagation"] propagation_config["root"].pop(data_normalizer.c_dlt_id) + @property + def schema_contract(self) -> TSchemaContract: + return self.schema.settings.get("schema_contract") + + @schema_contract.setter + def schema_contract(self, settings: TSchemaContract) -> None: + self.schema.set_schema_contract(settings) + + @property + def exhausted(self) -> bool: + """check all selected pipes wether one of them has started. if so, the source is exhausted.""" + for resource in self._resources.extracted.values(): + item = resource._pipe.gen + if inspect.isgenerator(item): + if inspect.getgeneratorstate(item) != "GEN_CREATED": + return True + return False + @property def resources(self) -> DltResourceDict: """A dictionary of all resources present in the source, where the key is a resource name.""" diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py index 7d7302aab6..8494d3bba3 100644 --- a/dlt/helpers/airflow_helper.py +++ b/dlt/helpers/airflow_helper.py @@ -11,6 +11,7 @@ RetryCallState, ) +from dlt.common.known_env import DLT_DATA_DIR, DLT_PROJECT_DIR from dlt.common.exceptions import MissingDependencyException try: @@ -121,7 +122,7 @@ def __init__( dags_folder = conf.get("core", "dags_folder") # set the dlt project folder to dags - os.environ["DLT_PROJECT_DIR"] = dags_folder + os.environ[DLT_PROJECT_DIR] = dags_folder # check if /data mount is available if use_data_folder and os.path.exists("/home/airflow/gcs/data"): @@ -129,7 +130,7 @@ def __init__( else: # create random path data_dir = os.path.join(local_data_folder or gettempdir(), f"dlt_{uniq_id(8)}") - os.environ["DLT_DATA_DIR"] = data_dir + os.environ[DLT_DATA_DIR] = data_dir # delete existing config providers in container, they will get reloaded on next use if ConfigProvidersContext in Container(): @@ -400,7 +401,7 @@ def add_run( """ # make sure that pipeline was created after dag was initialized - if not pipeline.pipelines_dir.startswith(os.environ["DLT_DATA_DIR"]): + if not pipeline.pipelines_dir.startswith(os.environ[DLT_DATA_DIR]): raise ValueError( "Please create your Pipeline instance after AirflowTasks are created. The dlt" " pipelines directory is not set correctly." 
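
The Airflow helper change above amounts to pointing dlt at different project and data folders through the environment variable names this patch centralizes in dlt/common/known_env.py. Below is a minimal sketch of the same mechanism outside Airflow; the temporary directories and the pipeline name are illustrative assumptions, not part of the patch.

```python
import os
import tempfile

# set the overrides before dlt is imported so every path helper picks them up
os.environ["DLT_PROJECT_DIR"] = tempfile.mkdtemp(prefix="dlt_project_")  # where the .dlt/ folder is looked up
os.environ["DLT_DATA_DIR"] = tempfile.mkdtemp(prefix="dlt_data_")  # where pipeline working dirs are created

import dlt

# throwaway pipeline name, used only for illustration
pipeline = dlt.pipeline(pipeline_name="known_env_demo")

# the working directory resolves under DLT_DATA_DIR, which is the invariant
# the Airflow helper's add_run check relies on
assert pipeline.pipelines_dir.startswith(os.environ["DLT_DATA_DIR"])
print(pipeline.pipelines_dir)
```
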
diff --git a/dlt/helpers/dbt/runner.py b/dlt/helpers/dbt/runner.py index c68931d7db..266581c785 100644 --- a/dlt/helpers/dbt/runner.py +++ b/dlt/helpers/dbt/runner.py @@ -1,3 +1,4 @@ +import sys import os from subprocess import CalledProcessError import giturlparse @@ -154,12 +155,12 @@ def _run_dbt_command( try: i = iter_stdout_with_result(self.venv, "python", "-c", script) while True: - print(next(i).strip()) + sys.stdout.write(next(i).strip()) except StopIteration as si: # return result from generator return si.value # type: ignore except CalledProcessError as cpe: - print(cpe.stderr) + sys.stderr.write(cpe.stderr) raise def run( diff --git a/dlt/load/load.py b/dlt/load/load.py index 76b4806694..0e78650a84 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -382,8 +382,12 @@ def complete_package(self, load_id: str, schema: Schema, aborted: bool = False) def load_single_package(self, load_id: str, schema: Schema) -> None: new_jobs = self.get_new_jobs_info(load_id) + # get dropped and truncated tables that were added in the extract step if refresh was requested + # NOTE: if naming convention was updated those names correspond to the old naming convention + # and they must be like that in order to drop existing tables dropped_tables = current_load_package()["state"].get("dropped_tables", []) truncated_tables = current_load_package()["state"].get("truncated_tables", []) + # initialize analytical storage ie. create dataset required by passed schema with self.get_destination_client(schema) as job_client: if (expected_update := self.load_storage.begin_schema_update(load_id)) is not None: diff --git a/dlt/load/utils.py b/dlt/load/utils.py index 7db05674fa..67a813f5f2 100644 --- a/dlt/load/utils.py +++ b/dlt/load/utils.py @@ -113,12 +113,15 @@ def init_client( ) ) + # get tables to drop + drop_table_names = {table["name"] for table in drop_tables} if drop_tables else set() + applied_update = _init_dataset_and_update_schema( job_client, expected_update, tables_with_jobs | dlt_tables, truncate_table_names, - drop_tables=drop_tables, + drop_tables=drop_table_names, ) # update the staging dataset if client supports this @@ -138,6 +141,7 @@ def init_client( staging_tables | {schema.version_table_name}, # keep only schema version staging_tables, # all eligible tables must be also truncated staging_info=True, + drop_tables=drop_table_names, # try to drop all the same tables on staging ) return applied_update @@ -149,7 +153,7 @@ def _init_dataset_and_update_schema( update_tables: Iterable[str], truncate_tables: Iterable[str] = None, staging_info: bool = False, - drop_tables: Optional[List[TTableSchema]] = None, + drop_tables: Iterable[str] = None, ) -> TSchemaTables: staging_text = "for staging dataset" if staging_info else "" logger.info( @@ -158,12 +162,17 @@ def _init_dataset_and_update_schema( ) job_client.initialize_storage() if drop_tables: - drop_table_names = [table["name"] for table in drop_tables] if hasattr(job_client, "drop_tables"): logger.info( - f"Client for {job_client.config.destination_type} will drop tables {staging_text}" + f"Client for {job_client.config.destination_type} will drop tables" + f" {drop_tables} {staging_text}" + ) + job_client.drop_tables(*drop_tables, delete_schema=True) + else: + logger.warning( + f"Client for {job_client.config.destination_type} does not implement drop table." 
+ f" Following tables {drop_tables} will not be dropped {staging_text}" ) - job_client.drop_tables(*drop_table_names, delete_schema=True) logger.info( f"Client for {job_client.config.destination_type} will update schema to package schema" diff --git a/dlt/normalize/worker.py b/dlt/normalize/worker.py index d5d4a028d9..cd50c56e09 100644 --- a/dlt/normalize/worker.py +++ b/dlt/normalize/worker.py @@ -46,6 +46,7 @@ def group_worker_files(files: Sequence[str], no_groups: int) -> List[Sequence[st remainder_l = len(chunk_files) - no_groups l_idx = 0 while remainder_l > 0: + idx = 0 for idx, file in enumerate(reversed(chunk_files.pop())): chunk_files[-l_idx - idx - remainder_l].append(file) # type: ignore remainder_l -= 1 diff --git a/dlt/pipeline/drop.py b/dlt/pipeline/drop.py index 486bead2f4..cd982cf676 100644 --- a/dlt/pipeline/drop.py +++ b/dlt/pipeline/drop.py @@ -17,6 +17,7 @@ group_tables_by_resource, compile_simple_regexes, compile_simple_regex, + has_table_seen_data, ) from dlt.common import jsonpath from dlt.common.typing import REPattern @@ -24,11 +25,11 @@ class _DropInfo(TypedDict): tables: List[str] + tables_with_data: List[str] resource_states: List[str] resource_names: List[str] state_paths: List[str] schema_name: str - dataset_name: Optional[str] drop_all: bool resource_pattern: Optional[REPattern] warnings: List[str] @@ -39,7 +40,7 @@ class _DropResult: schema: Schema state: TPipelineState info: _DropInfo - dropped_tables: List[TTableSchema] + modified_tables: List[TTableSchema] def _create_modified_state( @@ -85,12 +86,12 @@ def drop_resources( """Generate a new schema and pipeline state with the requested resources removed. Args: - schema: The schema to modify. - state: The pipeline state to modify. + schema: The schema to modify. Note that schema is changed in place. + state: The pipeline state to modify. Note that state is changed in place. resources: Resource name(s) or regex pattern(s) matching resource names to drop. If empty, no resources will be dropped unless `drop_all` is True. state_paths: JSON path(s) relative to the source state to drop. - drop_all: If True, all resources will be dropped (supeseeds `resources`). + drop_all: If True, all resources will be dropped (supersedes `resources`). 
state_only: If True, only modify the pipeline state, not schema sources: Only wipe state for sources matching the name(s) or regex pattern(s) in this list If not set all source states will be modified according to `state_paths` and `resources` @@ -112,9 +113,6 @@ def drop_resources( state_paths = jsonpath.compile_paths(state_paths) - schema = schema.clone() - state = deepcopy(state) - resources = set(resources) if drop_all: resource_pattern = compile_simple_regex(TSimpleRegex("re:.*")) # Match everything @@ -128,28 +126,28 @@ def drop_resources( source_pattern = compile_simple_regex(TSimpleRegex("re:.*")) # Match everything if resource_pattern: - data_tables = { - t["name"]: t for t in schema.data_tables(seen_data_only=True) - } # Don't remove _dlt tables + # (1) Don't remove _dlt tables (2) Drop all selected tables from the schema + # (3) Mark tables that seen data to be dropped in destination + data_tables = {t["name"]: t for t in schema.data_tables(include_incomplete=True)} resource_tables = group_tables_by_resource(data_tables, pattern=resource_pattern) resource_names = list(resource_tables.keys()) - # TODO: If drop_tables - if not state_only: - tables_to_drop = list(chain.from_iterable(resource_tables.values())) - tables_to_drop.reverse() - else: - tables_to_drop = [] + tables_to_drop_from_schema = list(chain.from_iterable(resource_tables.values())) + tables_to_drop_from_schema.reverse() + tables_to_drop_from_schema_names = [t["name"] for t in tables_to_drop_from_schema] + tables_to_drop_from_dest = [t for t in tables_to_drop_from_schema if has_table_seen_data(t)] else: - tables_to_drop = [] + tables_to_drop_from_schema_names = [] + tables_to_drop_from_dest = [] + tables_to_drop_from_schema = [] resource_names = [] info: _DropInfo = dict( - tables=[t["name"] for t in tables_to_drop], + tables=tables_to_drop_from_schema_names if not state_only else [], + tables_with_data=[t["name"] for t in tables_to_drop_from_dest] if not state_only else [], resource_states=[], state_paths=[], - resource_names=resource_names, + resource_names=resource_names if not state_only else [], schema_name=schema.name, - dataset_name=None, drop_all=drop_all, resource_pattern=resource_pattern, warnings=[], @@ -158,7 +156,7 @@ def drop_resources( new_state, info = _create_modified_state( state, resource_pattern, source_pattern, state_paths, info ) - info["resource_names"] = resource_names + # info["resource_names"] = resource_names if not state_only else [] if resource_pattern and not resource_tables: info["warnings"].append( @@ -167,5 +165,7 @@ def drop_resources( f" {list(group_tables_by_resource(data_tables).keys())}" ) - dropped_tables = schema.drop_tables([t["name"] for t in tables_to_drop], seen_data_only=True) - return _DropResult(schema, new_state, info, dropped_tables) + if not state_only: + # drop only the selected tables + schema.drop_tables(tables_to_drop_from_schema_names) + return _DropResult(schema, new_state, info, tables_to_drop_from_dest) diff --git a/dlt/pipeline/helpers.py b/dlt/pipeline/helpers.py index 0defbc14eb..ce81b81433 100644 --- a/dlt/pipeline/helpers.py +++ b/dlt/pipeline/helpers.py @@ -12,8 +12,10 @@ from dlt.common.jsonpath import TAnyJsonPath from dlt.common.exceptions import TerminalException +from dlt.common.schema.schema import Schema from dlt.common.schema.typing import TSimpleRegex from dlt.common.pipeline import pipeline_state as current_pipeline_state, TRefreshMode +from dlt.common.storages.load_package import TLoadPackageDropTablesState from dlt.pipeline.exceptions import ( 
PipelineNeverRan, PipelineStepFailed, @@ -83,24 +85,24 @@ def __init__( if not pipeline.default_schema_name: raise PipelineNeverRan(pipeline.pipeline_name, pipeline.pipelines_dir) + # clone schema to keep it as original in case we need to restore pipeline schema self.schema = pipeline.schemas[schema_name or pipeline.default_schema_name].clone() drop_result = drop_resources( - # self._drop_schema, self._new_state, self.info = drop_resources( - self.schema, - pipeline.state, + # create clones to have separate schemas and state + self.schema.clone(), + deepcopy(pipeline.state), resources, state_paths, drop_all, state_only, ) - + # get modified schema and state self._new_state = drop_result.state - self.info = drop_result.info self._new_schema = drop_result.schema - self._dropped_tables = drop_result.dropped_tables - self.drop_tables = not state_only and bool(self._dropped_tables) - + self.info = drop_result.info + self._modified_tables = drop_result.modified_tables + self.drop_tables = not state_only and bool(self._modified_tables) self.drop_state = bool(drop_all or resources or state_paths) @property @@ -130,7 +132,9 @@ def __call__(self) -> None: self.pipeline._save_and_extract_state_and_schema( new_state, schema=self._new_schema, - load_package_state_update={"dropped_tables": self._dropped_tables}, + load_package_state_update=( + {"dropped_tables": self._modified_tables} if self.drop_tables else None + ), ) self.pipeline.normalize() @@ -159,30 +163,33 @@ def drop( def refresh_source( pipeline: "Pipeline", source: DltSource, refresh: TRefreshMode -) -> Dict[str, Any]: - """Run the pipeline's refresh mode on the given source, updating the source's schema and state. +) -> TLoadPackageDropTablesState: + """Run the pipeline's refresh mode on the given source, updating the provided `schema` and pipeline state. Returns: The new load package state containing tables that need to be dropped/truncated. 
""" - if pipeline.first_run: - return {} pipeline_state, _ = current_pipeline_state(pipeline._container) _resources_to_drop = list(source.resources.extracted) if refresh != "drop_sources" else [] + only_truncate = refresh == "drop_data" + drop_result = drop_resources( + # do not cline the schema, change in place source.schema, + # do not clone the state, change in place pipeline_state, resources=_resources_to_drop, drop_all=refresh == "drop_sources", state_paths="*" if refresh == "drop_sources" else [], + state_only=only_truncate, sources=source.name, ) - load_package_state = {} - if drop_result.dropped_tables: - key = "dropped_tables" if refresh != "drop_data" else "truncated_tables" - load_package_state[key] = drop_result.dropped_tables - if refresh != "drop_data": # drop_data is only data wipe, keep original schema - source.schema = drop_result.schema - if "sources" in drop_result.state: - pipeline_state["sources"] = drop_result.state["sources"] + load_package_state: TLoadPackageDropTablesState = {} + if drop_result.modified_tables: + if only_truncate: + load_package_state["truncated_tables"] = drop_result.modified_tables + else: + load_package_state["dropped_tables"] = drop_result.modified_tables + # if any tables should be dropped, we force state to extract + force_state_extract(pipeline_state) return load_package_state diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 2bfee3fd29..ac5d3b90e4 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -1,6 +1,7 @@ import contextlib import os from contextlib import contextmanager +from copy import deepcopy, copy from functools import wraps from typing import ( Any, @@ -157,10 +158,8 @@ def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: # backup and restore state should_extract_state = may_extract_state and self.config.restore_from_destination - with self.managed_state(extract_state=should_extract_state) as state: - # add the state to container as a context - with self._container.injectable_context(StateInjectableContext(state=state)): - return f(self, *args, **kwargs) + with self.managed_state(extract_state=should_extract_state): + return f(self, *args, **kwargs) return _wrap # type: ignore @@ -438,12 +437,12 @@ def extract( workers, refresh=refresh or self.refresh, ) - # extract state - if self.config.restore_from_destination: - # this will update state version hash so it will not be extracted again by with_state_sync - self._bump_version_and_extract_state( - self._container[StateInjectableContext].state, True, extract_step - ) + # this will update state version hash so it will not be extracted again by with_state_sync + self._bump_version_and_extract_state( + self._container[StateInjectableContext].state, + self.config.restore_from_destination, + extract_step, + ) # commit load packages with state extract_step.commit_packages() return self._get_step_info(extract_step) @@ -1107,8 +1106,9 @@ def _extract_source( max_parallel_items: int, workers: int, refresh: Optional[TRefreshMode] = None, - load_package_state_update: Optional[Dict[str, Any]] = None, + load_package_state_update: Optional[TLoadPackageState] = None, ) -> str: + load_package_state_update = copy(load_package_state_update or {}) # discover the existing pipeline schema try: # all live schemas are initially committed and during the extract will accumulate changes in memory @@ -1116,19 +1116,34 @@ def _extract_source( # this will (1) look for import schema if present # (2) load import schema an overwrite pipeline schema if import 
schema modified # (3) load pipeline schema if no import schema is present - pipeline_schema = self.schemas[source.schema.name] - pipeline_schema = pipeline_schema.clone() # use clone until extraction complete - # apply all changes in the source schema to pipeline schema - # NOTE: we do not apply contracts to changes done programmatically - pipeline_schema.update_schema(source.schema) - # replace schema in the source - source.schema = pipeline_schema + + # keep schema created by the source so we can apply changes from it later + source_schema = source.schema + # use existing pipeline schema as the source schema, clone until extraction complete + source.schema = self.schemas[source.schema.name].clone() + # refresh the pipeline schema ie. to drop certain tables before any normalizes change + if refresh: + # NOTE: we use original pipeline schema to detect dropped/truncated tables so we can drop + # the original names, before eventual new naming convention is applied + load_package_state_update.update(deepcopy(refresh_source(self, source, refresh))) + if refresh == "drop_sources": + # replace the whole source AFTER we got tables to drop + source.schema = source_schema + # NOTE: we do pass any programmatic changes from source schema to pipeline schema except settings below + # TODO: enable when we have full identifier lineage and we are able to merge table identifiers + if type(source.schema.naming) is not type(source_schema.naming): # noqa + source.schema_contract = source_schema.settings.get("schema_contract") + else: + source.schema.update_schema(source_schema) except FileNotFoundError: - pass + if refresh is not None: + logger.info( + f"Refresh flag {refresh} has no effect on source {source.name} because the" + " source is extracted for a first time" + ) - load_package_state_update = dict(load_package_state_update or {}) - if refresh: - load_package_state_update.update(refresh_source(self, source, refresh)) + # update the normalizers to detect any conflicts early + source.schema.update_normalizers() # extract into pipeline schema load_id = extract.extract( @@ -1335,9 +1350,9 @@ def _set_destinations( def _maybe_destination_capabilities( self, ) -> Iterator[DestinationCapabilitiesContext]: + caps: DestinationCapabilitiesContext = None + injected_caps: ContextManager[DestinationCapabilitiesContext] = None try: - caps: DestinationCapabilitiesContext = None - injected_caps: ContextManager[DestinationCapabilitiesContext] = None if self.destination: destination_caps = self._get_destination_capabilities() stage_caps = self._get_staging_capabilities() @@ -1504,11 +1519,15 @@ def _get_schemas_from_destination( @contextmanager def managed_state(self, *, extract_state: bool = False) -> Iterator[TPipelineState]: - # load or restore state + """Puts pipeline state in managed mode, where yielded state changes will be persisted or fully roll-backed on exception. 
+ + Makes the state to be available via StateInjectableContext + """ state = self._get_state() - # TODO: we should backup schemas here try: - yield state + # add the state to container as a context + with self._container.injectable_context(StateInjectableContext(state=state)): + yield state except Exception: backup_state = self._get_state() # restore original pipeline props @@ -1576,7 +1595,7 @@ def _save_and_extract_state_and_schema( self, state: TPipelineState, schema: Schema, - load_package_state_update: Optional[Dict[str, Any]] = None, + load_package_state_update: Optional[TLoadPackageState] = None, ) -> None: """Save given state + schema and extract creating a new load package @@ -1601,7 +1620,7 @@ def _bump_version_and_extract_state( state: TPipelineState, extract_state: bool, extract: Extract = None, - load_package_state_update: Optional[Dict[str, Any]] = None, + load_package_state_update: Optional[TLoadPackageState] = None, schema: Optional[Schema] = None, ) -> None: """Merges existing state into `state` and extracts state using `storage` if extract_state is True. diff --git a/dlt/sources/helpers/requests/retry.py b/dlt/sources/helpers/requests/retry.py index 3f9d7d559e..7d7d6493ec 100644 --- a/dlt/sources/helpers/requests/retry.py +++ b/dlt/sources/helpers/requests/retry.py @@ -119,8 +119,8 @@ def _make_retry( retry_conds = [retry_if_status(status_codes), retry_if_exception_type(tuple(exceptions))] if condition is not None: if callable(condition): - retry_condition = [condition] - retry_conds.extend([retry_if_predicate(c) for c in retry_condition]) + condition = [condition] + retry_conds.extend([retry_if_predicate(c) for c in condition]) wait_cls = wait_exponential_retry_after if respect_retry_after_header else wait_exponential return Retrying( diff --git a/poetry.lock b/poetry.lock index 323b2188d3..a7d754f5a8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2813,6 +2813,21 @@ files = [ [package.dependencies] flake8 = ">=3.8.4" +[[package]] +name = "flake8-print" +version = "5.0.0" +description = "print statement checker plugin for flake8" +optional = false +python-versions = ">=3.7" +files = [ + {file = "flake8-print-5.0.0.tar.gz", hash = "sha256:76915a2a389cc1c0879636c219eb909c38501d3a43cc8dae542081c9ba48bdf9"}, + {file = "flake8_print-5.0.0-py3-none-any.whl", hash = "sha256:84a1a6ea10d7056b804221ac5e62b1cee1aefc897ce16f2e5c42d3046068f5d8"}, +] + +[package.dependencies] +flake8 = ">=3.0" +pycodestyle = "*" + [[package]] name = "flake8-tidy-imports" version = "4.10.0" @@ -9643,4 +9658,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "bb75ee485742aa176ad726fd468832642096145fff0543472b998e04b8b053d0" +content-hash = "1205791c3a090cf55617833ef566f1d55e6fcfa7209079bca92277f217130549" diff --git a/pyproject.toml b/pyproject.toml index 099850b6bf..6f21d17be7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -154,6 +154,7 @@ ruff = "^0.3.2" pyjwt = "^2.8.0" pytest-mock = "^3.14.0" types-regex = "^2024.5.15.20240519" +flake8-print = "^5.0.0" [tool.poetry.group.pipeline] optional = true diff --git a/tests/cli/common/test_cli_invoke.py b/tests/cli/common/test_cli_invoke.py index d367a97261..f856162479 100644 --- a/tests/cli/common/test_cli_invoke.py +++ b/tests/cli/common/test_cli_invoke.py @@ -6,6 +6,7 @@ from unittest.mock import patch import dlt +from dlt.common.known_env import DLT_DATA_DIR from dlt.common.configuration.paths import get_dlt_data_dir from dlt.common.runners.venv import Venv from dlt.common.utils 
import custom_environ, set_working_dir @@ -62,7 +63,7 @@ def test_invoke_pipeline(script_runner: ScriptRunner) -> None: shutil.copytree("tests/cli/cases/deploy_pipeline", TEST_STORAGE_ROOT, dirs_exist_ok=True) with set_working_dir(TEST_STORAGE_ROOT): - with custom_environ({"COMPETED_PROB": "1.0", "DLT_DATA_DIR": get_dlt_data_dir()}): + with custom_environ({"COMPETED_PROB": "1.0", DLT_DATA_DIR: get_dlt_data_dir()}): venv = Venv.restore_current() venv.run_script("dummy_pipeline.py") # we check output test_pipeline_command else @@ -96,7 +97,7 @@ def test_invoke_pipeline(script_runner: ScriptRunner) -> None: def test_invoke_init_chess_and_template(script_runner: ScriptRunner) -> None: with set_working_dir(TEST_STORAGE_ROOT): # store dlt data in test storage (like patch_home_dir) - with custom_environ({"DLT_DATA_DIR": get_dlt_data_dir()}): + with custom_environ({DLT_DATA_DIR: get_dlt_data_dir()}): result = script_runner.run(["dlt", "init", "chess", "dummy"]) assert "Verified source chess was added to your project!" in result.stdout assert result.returncode == 0 @@ -116,7 +117,7 @@ def test_invoke_list_verified_sources(script_runner: ScriptRunner) -> None: def test_invoke_deploy_project(script_runner: ScriptRunner) -> None: with set_working_dir(TEST_STORAGE_ROOT): # store dlt data in test storage (like patch_home_dir) - with custom_environ({"DLT_DATA_DIR": get_dlt_data_dir()}): + with custom_environ({DLT_DATA_DIR: get_dlt_data_dir()}): result = script_runner.run( ["dlt", "deploy", "debug_pipeline.py", "github-action", "--schedule", "@daily"] ) diff --git a/tests/common/configuration/test_toml_provider.py b/tests/common/configuration/test_toml_provider.py index ccc73a30c0..5271c68633 100644 --- a/tests/common/configuration/test_toml_provider.py +++ b/tests/common/configuration/test_toml_provider.py @@ -10,6 +10,7 @@ from dlt.common.configuration.container import Container from dlt.common.configuration.inject import with_config from dlt.common.configuration.exceptions import LookupTrace +from dlt.common.known_env import DLT_DATA_DIR, DLT_PROJECT_DIR from dlt.common.configuration.providers.toml import ( SECRETS_TOML, CONFIG_TOML, @@ -257,8 +258,8 @@ def test_toml_global_config() -> None: assert config._add_global_config is False # type: ignore[attr-defined] # set dlt data and settings dir - os.environ["DLT_DATA_DIR"] = "./tests/common/cases/configuration/dlt_home" - os.environ["DLT_PROJECT_DIR"] = "./tests/common/cases/configuration/" + os.environ[DLT_DATA_DIR] = "./tests/common/cases/configuration/dlt_home" + os.environ[DLT_PROJECT_DIR] = "./tests/common/cases/configuration/" # create instance with global toml enabled config = ConfigTomlProvider(add_global_config=True) assert config._add_global_config is True diff --git a/tests/common/schema/test_normalize_identifiers.py b/tests/common/schema/test_normalize_identifiers.py index 60f8c04604..646a693ea6 100644 --- a/tests/common/schema/test_normalize_identifiers.py +++ b/tests/common/schema/test_normalize_identifiers.py @@ -352,7 +352,9 @@ def test_raise_on_change_identifier_table_with_data() -> None: os.environ["SCHEMA__NAMING"] = "tests.common.cases.normalizers.sql_upper" with pytest.raises(TableIdentifiersFrozen) as fr_ex: schema.update_normalizers() - assert fr_ex.value.table_name == "issues" + # _dlt_version is the first table to be normalized, and since there are tables + # that have seen data, we consider _dlt_version also be materialized + assert fr_ex.value.table_name == "_dlt_version" assert isinstance(fr_ex.value.from_naming, 
snake_case.NamingConvention) assert isinstance(fr_ex.value.to_naming, sql_upper.NamingConvention) # try again, get exception (schema was not partially modified) diff --git a/tests/common/test_json.py b/tests/common/test_json.py index 79037ebf93..b7d25589a7 100644 --- a/tests/common/test_json.py +++ b/tests/common/test_json.py @@ -6,6 +6,7 @@ from dlt.common import json, Decimal, pendulum from dlt.common.arithmetics import numeric_default_context +from dlt.common import known_env from dlt.common.json import ( _DECIMAL, _WEI, @@ -306,7 +307,7 @@ def test_garbage_pua_string(json_impl: SupportsJson) -> None: def test_change_pua_start() -> None: import inspect - os.environ["DLT_JSON_TYPED_PUA_START"] = "0x0FA179" + os.environ[known_env.DLT_JSON_TYPED_PUA_START] = "0x0FA179" from importlib import reload try: @@ -316,7 +317,7 @@ def test_change_pua_start() -> None: assert MOD_PUA_START == int("0x0FA179", 16) finally: # restore old start - os.environ["DLT_JSON_TYPED_PUA_START"] = hex(PUA_START) + os.environ[known_env.DLT_JSON_TYPED_PUA_START] = hex(PUA_START) from importlib import reload reload(inspect.getmodule(SupportsJson)) diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index 8287da69d4..a170c6977d 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -39,6 +39,39 @@ def switch_to_fifo(): del os.environ["EXTRACT__NEXT_ITEM_MODE"] +def test_basic_source() -> None: + def basic_gen(): + yield 1 + + schema = Schema("test") + s = DltSource.from_data(schema, "section", basic_gen) + assert s.name == "test" + assert s.section == "section" + assert s.max_table_nesting is None + assert s.root_key is False + assert s.schema_contract is None + assert s.exhausted is False + assert s.schema is schema + assert len(s.resources) == 1 + assert s.resources == s.selected_resources + + # set some props + s.max_table_nesting = 10 + assert s.max_table_nesting == 10 + s.root_key = True + assert s.root_key is True + s.schema_contract = "evolve" + assert s.schema_contract == "evolve" + + s.max_table_nesting = None + s.root_key = False + s.schema_contract = None + + assert s.max_table_nesting is None + assert s.root_key is False + assert s.schema_contract is None + + def test_call_data_resource() -> None: with pytest.raises(TypeError): DltResource.from_data([1], name="t")() diff --git a/tests/load/filesystem/test_filesystem_common.py b/tests/load/filesystem/test_filesystem_common.py index a7b1371f9f..3cad7dda2c 100644 --- a/tests/load/filesystem/test_filesystem_common.py +++ b/tests/load/filesystem/test_filesystem_common.py @@ -1,9 +1,10 @@ import os import posixpath -from typing import Union, Dict +from typing import Tuple, Union, Dict from urllib.parse import urlparse +from fsspec import AbstractFileSystem import pytest from tenacity import retry, stop_after_attempt, wait_fixed @@ -21,7 +22,7 @@ FilesystemDestinationClientConfiguration, ) from dlt.destinations.impl.filesystem.typing import TExtraPlaceholders -from tests.common.storages.utils import assert_sample_files +from tests.common.storages.utils import TEST_SAMPLE_FILES, assert_sample_files from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AWS_BUCKET from tests.utils import autouse_test_storage from .utils import self_signed_cert @@ -98,27 +99,26 @@ def check_file_changed(file_url_: str): @pytest.mark.parametrize("load_content", (True, False)) @pytest.mark.parametrize("glob_filter", ("**", "**/*.csv", "*.txt", "met_csv/A803/*.csv")) -def test_filesystem_dict( - with_gdrive_buckets_env: str, load_content: 
bool, glob_filter: str -) -> None: +def test_glob_files(with_gdrive_buckets_env: str, load_content: bool, glob_filter: str) -> None: bucket_url = os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] - config = get_config() - # enable caches - config.read_only = True - if config.protocol in ["memory", "file"]: - pytest.skip(f"{config.protocol} not supported in this test") - glob_folder = "standard_source/samples" - # may contain query string - bucket_url_parsed = urlparse(bucket_url) - bucket_url = bucket_url_parsed._replace( - path=posixpath.join(bucket_url_parsed.path, glob_folder) - ).geturl() - filesystem, _ = fsspec_from_config(config) + bucket_url, config, filesystem = glob_test_setup(bucket_url, "standard_source/samples") # use glob to get data all_file_items = list(glob_files(filesystem, bucket_url, glob_filter)) + # assert len(all_file_items) == 0 assert_sample_files(all_file_items, filesystem, config, load_content, glob_filter) +def test_glob_overlapping_path_files(with_gdrive_buckets_env: str) -> None: + bucket_url = os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] + # "standard_source/sample" overlaps with a real existing "standard_source/samples". walk operation on azure + # will return all files from "standard_source/samples" and report the wrong "standard_source/sample" path to the user + # here we test we do not have this problem with out glob + bucket_url, _, filesystem = glob_test_setup(bucket_url, "standard_source/sample") + # use glob to get data + all_file_items = list(glob_files(filesystem, bucket_url)) + assert len(all_file_items) == 0 + + @pytest.mark.skipif("s3" not in ALL_FILESYSTEM_DRIVERS, reason="s3 destination not configured") def test_filesystem_instance_from_s3_endpoint(environment: Dict[str, str]) -> None: """Test that fsspec instance is correctly configured when using endpoint URL. 
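The `test_glob_overlapping_path_files` test added above guards against prefix overlap: globbing `standard_source/sample` must not return files that actually live under the sibling `standard_source/samples` folder (which a plain walk on azure does). A minimal illustration of the failure mode, with a made-up bucket URL and file name:

    bucket = "az://ci-bucket/standard_source/sample"  # note: no trailing "s"
    real_file = "az://ci-bucket/standard_source/samples/met_csv/A803/file_1.csv"

    # a naive prefix check wrongly claims the file belongs to "sample"
    assert real_file.startswith(bucket)
    # terminating the prefix with a path separator avoids the false match
    assert not real_file.startswith(bucket + "/")
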
@@ -264,3 +264,26 @@ def test_filesystem_destination_passed_parameters_override_config_values() -> No bound_config = filesystem_destination.configuration(filesystem_config) assert bound_config.current_datetime == config_now assert bound_config.extra_placeholders == config_extra_placeholders + + +def glob_test_setup( + bucket_url: str, glob_folder: str +) -> Tuple[str, FilesystemConfiguration, AbstractFileSystem]: + config = get_config() + # enable caches + config.read_only = True + if config.protocol in ["file"]: + pytest.skip(f"{config.protocol} not supported in this test") + + # may contain query string + bucket_url_parsed = urlparse(bucket_url) + bucket_url = bucket_url_parsed._replace( + path=posixpath.join(bucket_url_parsed.path, glob_folder) + ).geturl() + filesystem, _ = fsspec_from_config(config) + if config.protocol == "memory": + mem_path = os.path.join("m", "standard_source") + if not filesystem.isdir(mem_path): + filesystem.mkdirs(mem_path) + filesystem.upload(TEST_SAMPLE_FILES, mem_path, recursive=True) + return bucket_url, config, filesystem diff --git a/tests/load/lancedb/test_pipeline.py b/tests/load/lancedb/test_pipeline.py index a89153f629..e817a2f6c8 100644 --- a/tests/load/lancedb/test_pipeline.py +++ b/tests/load/lancedb/test_pipeline.py @@ -374,7 +374,7 @@ def some_data() -> Generator[List[DictStrAny], Any, None]: def test_merge_github_nested() -> None: - pipe = dlt.pipeline(destination="lancedb", dataset_name="github1", full_refresh=True) + pipe = dlt.pipeline(destination="lancedb", dataset_name="github1", dev_mode=True) assert pipe.dataset_name.startswith("github1_202") with open( @@ -422,7 +422,7 @@ def test_merge_github_nested() -> None: def test_empty_dataset_allowed() -> None: # dataset_name is optional so dataset name won't be autogenerated when not explicitly passed. 
- pipe = dlt.pipeline(destination="lancedb", full_refresh=True) + pipe = dlt.pipeline(destination="lancedb", dev_mode=True) client: LanceDBClient = pipe.destination_client() # type: ignore[assignment] assert pipe.dataset_name is None diff --git a/tests/load/pipeline/test_bigquery.py b/tests/load/pipeline/test_bigquery.py index 0618ff9d3d..f4fdef8665 100644 --- a/tests/load/pipeline/test_bigquery.py +++ b/tests/load/pipeline/test_bigquery.py @@ -15,7 +15,7 @@ ids=lambda x: x.name, ) def test_bigquery_numeric_types(destination_config: DestinationTestConfiguration) -> None: - pipeline = destination_config.setup_pipeline("test_bigquery_numeric_types") + pipeline = destination_config.setup_pipeline("test_bigquery_numeric_types", dev_mode=True) columns = [ {"name": "col_big_numeric", "data_type": "decimal", "precision": 47, "scale": 9}, diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 210ad76b8a..3f0352cab7 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -601,9 +601,11 @@ def _collect_files(p) -> List[str]: found.append(os.path.join(basedir, file).replace(client.dataset_path, "")) return found - def _collect_table_counts(p) -> Dict[str, int]: + def _collect_table_counts(p, *items: str) -> Dict[str, int]: + expected_items = set(items).intersection({"items", "items2", "items3"}) + print(expected_items) return load_table_counts( - p, "items", "items2", "items3", "_dlt_loads", "_dlt_version", "_dlt_pipeline_state" + p, *expected_items, "_dlt_loads", "_dlt_version", "_dlt_pipeline_state" ) # generate 4 loads from 2 pipelines, store load ids @@ -616,7 +618,7 @@ def _collect_table_counts(p) -> Dict[str, int]: # first two loads p1.run([1, 2, 3], table_name="items").loads_ids[0] load_id_2_1 = p2.run([4, 5, 6], table_name="items").loads_ids[0] - assert _collect_table_counts(p1) == { + assert _collect_table_counts(p1, "items") == { "items": 6, "_dlt_loads": 2, "_dlt_pipeline_state": 2, @@ -643,7 +645,7 @@ def some_data(): p2.run([4, 5, 6], table_name="items").loads_ids[0] # no migration here # 4 loads for 2 pipelines, one schema and state change on p2 changes so 3 versions and 3 states - assert _collect_table_counts(p1) == { + assert _collect_table_counts(p1, "items", "items2") == { "items": 9, "items2": 3, "_dlt_loads": 4, diff --git a/tests/load/pipeline/test_refresh_modes.py b/tests/load/pipeline/test_refresh_modes.py index de557ba118..f4bf3b0311 100644 --- a/tests/load/pipeline/test_refresh_modes.py +++ b/tests/load/pipeline/test_refresh_modes.py @@ -2,21 +2,30 @@ import pytest import dlt +from dlt.common.destination.exceptions import DestinationUndefinedEntity from dlt.common.pipeline import resource_state -from dlt.destinations.sql_client import DBApiCursor -from dlt.pipeline.state_sync import load_pipeline_state_from_destination +from dlt.common.utils import uniq_id from dlt.common.typing import DictStrAny from dlt.common.pipeline import pipeline_state as current_pipeline_state +from dlt.destinations.sql_client import DBApiCursor +from dlt.extract.source import DltSource +from dlt.pipeline.state_sync import load_pipeline_state_from_destination + from tests.utils import clean_test_storage from tests.pipeline.utils import ( + _is_filesystem, assert_load_info, + load_table_counts, load_tables_to_dicts, assert_only_table_columns, table_exists, ) from tests.load.utils import destinations_configs, DestinationTestConfiguration +# mark all tests as essential, do not 
remove +pytestmark = pytest.mark.essential + def assert_source_state_is_wiped(state: DictStrAny) -> None: # Keys contains only "resources" or is empty @@ -66,7 +75,7 @@ def some_data_2(): yield {"id": 7} yield {"id": 8} - @dlt.resource + @dlt.resource(primary_key="id", write_disposition="merge") def some_data_3(): if first_run: dlt.state()["source_key_3"] = "source_value_3" @@ -103,7 +112,6 @@ def test_refresh_drop_sources(destination_config: DestinationTestConfiguration): # First run pipeline so destination so tables are created info = pipeline.run(refresh_source(first_run=True, drop_sources=True)) assert_load_info(info) - # Second run of pipeline with only selected resources info = pipeline.run( refresh_source(first_run=False, drop_sources=True).with_resources( @@ -114,8 +122,6 @@ def test_refresh_drop_sources(destination_config: DestinationTestConfiguration): assert set(t["name"] for t in pipeline.default_schema.data_tables(include_incomplete=True)) == { "some_data_1", "some_data_2", - # Table has never seen data and is not dropped - "some_data_4", } # No "name" column should exist as table was dropped and re-created without it @@ -163,7 +169,7 @@ def test_existing_schema_hash(destination_config: DestinationTestConfiguration): new_table_names = set( t["name"] for t in pipeline.default_schema.data_tables(include_incomplete=True) ) - assert new_table_names == {"some_data_1", "some_data_2", "some_data_4"} + assert new_table_names == {"some_data_1", "some_data_2"} # Run again with all tables to ensure they are re-created # The new schema in this case should match the schema of the first run exactly @@ -430,10 +436,76 @@ def test_refresh_argument_to_extract(destination_config: DestinationTestConfigur tables = set(t["name"] for t in pipeline.default_schema.data_tables(include_incomplete=True)) # All other data tables removed - assert tables == {"some_data_3", "some_data_4"} + assert tables == {"some_data_3"} # Run again without refresh to confirm refresh option doesn't persist on pipeline pipeline.extract(refresh_source(first_run=False).with_resources("some_data_2")) tables = set(t["name"] for t in pipeline.default_schema.data_tables(include_incomplete=True)) - assert tables == {"some_data_2", "some_data_3", "some_data_4"} + assert tables == {"some_data_2", "some_data_3"} + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_sql_configs=True, default_staging_configs=True, all_buckets_filesystem_configs=True + ), + ids=lambda x: x.name, +) +def test_refresh_staging_dataset(destination_config: DestinationTestConfiguration): + data = [ + {"id": 1, "pop": 1}, + {"id": 2, "pop": 3}, + {"id": 2, "pop": 4}, # duplicate + ] + + pipeline = destination_config.setup_pipeline("test_refresh_staging_dataset" + uniq_id()) + + source = DltSource( + dlt.Schema("data_x"), + "data_section", + [ + dlt.resource(data, name="data_1", primary_key="id", write_disposition="merge"), + dlt.resource(data, name="data_2", primary_key="id", write_disposition="append"), + ], + ) + # create two tables so two tables need to be dropped + info = pipeline.run(source) + assert_load_info(info) + + # make data so inserting on mangled tables is not possible + data_i = [ + {"id": "A", "pop": 0.1}, + {"id": "B", "pop": 0.3}, + {"id": "A", "pop": 0.4}, + ] + source_i = DltSource( + dlt.Schema("data_x"), + "data_section", + [ + dlt.resource(data_i, name="data_1", primary_key="id", write_disposition="merge"), + dlt.resource(data_i, name="data_2", primary_key="id", write_disposition="append"), + ], + ) + 
info = pipeline.run(source_i, refresh="drop_resources") + assert_load_info(info) + + # now replace the whole source and load different tables + source_i = DltSource( + dlt.Schema("data_x"), + "data_section", + [ + dlt.resource(data_i, name="data_1_v2", primary_key="id", write_disposition="merge"), + dlt.resource(data_i, name="data_2_v2", primary_key="id", write_disposition="append"), + ], + ) + info = pipeline.run(source_i, refresh="drop_sources") + assert_load_info(info) + + # tables got dropped + if _is_filesystem(pipeline): + assert load_table_counts(pipeline, "data_1", "data_2") == {} + else: + with pytest.raises(DestinationUndefinedEntity): + load_table_counts(pipeline, "data_1", "data_2") + load_table_counts(pipeline, "data_1_v2", "data_1_v2") diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index 35b988d46e..69f6bd4cc4 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -338,10 +338,11 @@ def test_drop_tables(client: SqlJobClientBase) -> None: # Drop tables from the first schema client.schema = schema tables_to_drop = ["event_slot", "event_user"] - for tbl in tables_to_drop: - del schema.tables[tbl] + schema.drop_tables(tables_to_drop) schema._bump_version() - client.drop_tables(*tables_to_drop) + + # add one fake table to make sure one table can be ignored + client.drop_tables(tables_to_drop[0], "not_exists", *tables_to_drop[1:]) client._update_schema_in_storage(schema) # Schema was deleted, load it in again if isinstance(client, WithStagingDataset): with contextlib.suppress(DatabaseUndefinedRelation): diff --git a/tests/load/test_sql_client.py b/tests/load/test_sql_client.py index fa31f1db65..8d4e146034 100644 --- a/tests/load/test_sql_client.py +++ b/tests/load/test_sql_client.py @@ -112,6 +112,28 @@ def test_malformed_query_parameters(client: SqlJobClientBase) -> None: assert client.sql_client.is_dbapi_exception(term_ex.value.dbapi_exception) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) +def test_has_dataset(client: SqlJobClientBase) -> None: + with client.sql_client.with_alternative_dataset_name("not_existing"): + assert not client.sql_client.has_dataset() + client.update_stored_schema() + assert client.sql_client.has_dataset() + + +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) +def test_create_drop_dataset(client: SqlJobClientBase) -> None: + # client.sql_client.create_dataset() + with pytest.raises(DatabaseException): + client.sql_client.create_dataset() + client.sql_client.drop_dataset() + with pytest.raises(DatabaseUndefinedRelation): + client.sql_client.drop_dataset() + + @pytest.mark.parametrize( "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name ) diff --git a/tests/load/utils.py b/tests/load/utils.py index 9ee933a07a..95083b7d31 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -718,7 +718,8 @@ def yield_client_with_storage( ) as client: client.initialize_storage() yield client - client.sql_client.drop_dataset() + if client.is_storage_initialized(): + client.sql_client.drop_dataset() if isinstance(client, WithStagingDataset): with client.with_staging_dataset(): if client.is_storage_initialized(): diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index ba7c0b9db8..979bdd0e37 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ 
-3,10 +3,12 @@ import pytest import tempfile import shutil +from unittest.mock import patch from importlib.metadata import version as pkg_version import dlt from dlt.common import json, pendulum +from dlt.common.known_env import DLT_DATA_DIR from dlt.common.json import custom_pua_decode from dlt.common.runners import Venv from dlt.common.storages.exceptions import StorageMigrationError @@ -24,9 +26,49 @@ from dlt.destinations.impl.duckdb.configuration import DuckDbClientConfiguration from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient -from tests.pipeline.utils import load_table_counts +from tests.pipeline.utils import airtable_emojis, load_table_counts from tests.utils import TEST_STORAGE_ROOT, test_storage + +def test_simulate_default_naming_convention_change() -> None: + # checks that (future) change in the naming convention won't affect existing pipelines + pipeline = dlt.pipeline("simulated_snake_case", destination="duckdb") + assert pipeline.naming.name() == "snake_case" + info = pipeline.run( + airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock") + ) + info.raise_on_failed_jobs() + # normalized names + assert pipeline.last_trace.last_normalize_info.row_counts["_schedule"] == 3 + assert "_schedule" in pipeline.default_schema.tables + + # mock the mod + # from dlt.common.normalizers import utils + + with patch("dlt.common.normalizers.utils.DEFAULT_NAMING_MODULE", "duck_case"): + duck_pipeline = dlt.pipeline("simulated_duck_case", destination="duckdb") + assert duck_pipeline.naming.name() == "duck_case" + print(airtable_emojis().schema.naming.name()) + + # run new and old pipelines + info = duck_pipeline.run( + airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock") + ) + info.raise_on_failed_jobs() + print(duck_pipeline.last_trace.last_normalize_info.row_counts) + assert duck_pipeline.last_trace.last_normalize_info.row_counts["📆 Schedule"] == 3 + assert "📆 Schedule" in duck_pipeline.default_schema.tables + + # old pipeline should keep its naming convention + info = pipeline.run( + airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock") + ) + info.raise_on_failed_jobs() + # normalized names + assert pipeline.last_trace.last_normalize_info.row_counts["_schedule"] == 3 + assert pipeline.naming.name() == "snake_case" + + if sys.version_info >= (3, 12): pytest.skip("Does not run on Python 3.12 and later", allow_module_level=True) @@ -41,7 +83,7 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: # execute in test storage with set_working_dir(TEST_STORAGE_ROOT): # store dlt data in test storage (like patch_home_dir) - with custom_environ({"DLT_DATA_DIR": get_dlt_data_dir()}): + with custom_environ({DLT_DATA_DIR: get_dlt_data_dir()}): # save database outside of pipeline dir with custom_environ( {"DESTINATION__DUCKDB__CREDENTIALS": "duckdb:///test_github_3.duckdb"} @@ -175,7 +217,7 @@ def test_filesystem_pipeline_with_dlt_update(test_storage: FileStorage) -> None: # execute in test storage with set_working_dir(TEST_STORAGE_ROOT): # store dlt data in test storage (like patch_home_dir) - with custom_environ({"DLT_DATA_DIR": get_dlt_data_dir()}): + with custom_environ({DLT_DATA_DIR: get_dlt_data_dir()}): # create virtual env with (0.4.9) where filesystem started to store state with Venv.create(tempfile.mkdtemp(), ["dlt==0.4.9"]) as venv: try: @@ -247,7 +289,7 @@ def test_load_package_with_dlt_update(test_storage: FileStorage) -> None: # execute in test storage with set_working_dir(TEST_STORAGE_ROOT): 
# store dlt data in test storage (like patch_home_dir) - with custom_environ({"DLT_DATA_DIR": get_dlt_data_dir()}): + with custom_environ({DLT_DATA_DIR: get_dlt_data_dir()}): # save database outside of pipeline dir with custom_environ( {"DESTINATION__DUCKDB__CREDENTIALS": "duckdb:///test_github_3.duckdb"} @@ -322,7 +364,7 @@ def test_normalize_package_with_dlt_update(test_storage: FileStorage) -> None: # execute in test storage with set_working_dir(TEST_STORAGE_ROOT): # store dlt data in test storage (like patch_home_dir) - with custom_environ({"DLT_DATA_DIR": get_dlt_data_dir()}): + with custom_environ({DLT_DATA_DIR: get_dlt_data_dir()}): # save database outside of pipeline dir with custom_environ( {"DESTINATION__DUCKDB__CREDENTIALS": "duckdb:///test_github_3.duckdb"} diff --git a/tests/pipeline/test_drop_helpers.py b/tests/pipeline/test_drop_helpers.py new file mode 100644 index 0000000000..9a09d9f866 --- /dev/null +++ b/tests/pipeline/test_drop_helpers.py @@ -0,0 +1,209 @@ +import pytest +from copy import deepcopy + +import dlt +from dlt.common.schema.typing import LOADS_TABLE_NAME, PIPELINE_STATE_TABLE_NAME, VERSION_TABLE_NAME +from dlt.common.versioned_state import decompress_state +from dlt.pipeline.drop import drop_resources +from dlt.pipeline.helpers import DropCommand, refresh_source + +from tests.pipeline.utils import airtable_emojis, assert_load_info + + +@pytest.mark.parametrize("seen_data", [True, False], ids=["seen_data", "no_data"]) +def test_drop_helper_utils(seen_data: bool) -> None: + pipeline = dlt.pipeline("test_drop_helpers_no_table_drop", destination="duckdb") + # extract first which should produce tables that didn't seen data + source = airtable_emojis().with_resources( + "📆 Schedule", "🦚Peacock", "🦚WidePeacock", "💰Budget" + ) + if seen_data: + pipeline.run(source) + else: + pipeline.extract(source) + + # drop nothing + drop_info = drop_resources(pipeline.default_schema.clone(), pipeline.state) + assert drop_info.modified_tables == [] + assert drop_info.info["tables"] == [] + + # drop all resources + drop_info = drop_resources(pipeline.default_schema.clone(), pipeline.state, drop_all=True) + # no tables to drop + tables_to_drop = ( + {"_schedule", "_peacock", "_wide_peacock", "_peacock__peacock", "_wide_peacock__peacock"} + if seen_data + else set() + ) + tables_to_drop_schema = ( + tables_to_drop if seen_data else {"_schedule", "_peacock", "_wide_peacock"} + ) + assert {t["name"] for t in drop_info.modified_tables} == tables_to_drop + # no state mods + assert drop_info.state["sources"]["airtable_emojis"] == {"resources": {}} + assert set(drop_info.info["tables"]) == tables_to_drop_schema + assert set(drop_info.info["tables_with_data"]) == tables_to_drop + # all tables got dropped + assert drop_info.schema.data_tables(include_incomplete=True) == [] + # dlt tables still there + assert set(drop_info.schema.dlt_table_names()) == { + VERSION_TABLE_NAME, + LOADS_TABLE_NAME, + PIPELINE_STATE_TABLE_NAME, + } + # same but with refresh + source_clone = source.clone() + source_clone.schema = pipeline.default_schema.clone() + with pipeline.managed_state() as state: + emoji_state = deepcopy(state["sources"]["airtable_emojis"]) + package_state = refresh_source(pipeline, source_clone, refresh="drop_sources") + # managed state modified + assert state["sources"]["airtable_emojis"] == {"resources": {}} + # restore old state for next tests + state["sources"]["airtable_emojis"] = emoji_state + if seen_data: + assert {t["name"] for t in package_state["dropped_tables"]} == tables_to_drop 
+ else: + assert package_state == {} + assert source_clone.schema.data_tables(include_incomplete=True) == [] + + # drop only selected resources + tables_to_drop = {"_schedule"} if seen_data else set() + # seen_data means full run so we generate child tables in that case + left_in_schema = ( + {"_peacock", "_wide_peacock", "_peacock__peacock", "_wide_peacock__peacock"} + if seen_data + else {"_peacock", "_wide_peacock"} + ) + drop_info = drop_resources( + pipeline.default_schema.clone(), pipeline.state, resources=["📆 Schedule"] + ) + assert set(t["name"] for t in drop_info.modified_tables) == tables_to_drop + # no changes in state + assert drop_info.state == pipeline.state + assert set(drop_info.info["tables"]) == {"_schedule"} + assert set(drop_info.schema.data_table_names(include_incomplete=True)) == left_in_schema + source_clone = source_clone.with_resources("📆 Schedule") + source_clone.schema = pipeline.default_schema.clone() + with pipeline.managed_state() as state: + package_state = refresh_source(pipeline, source_clone, refresh="drop_resources") + # state not modified + assert state["sources"]["airtable_emojis"] == {"resources": {"🦚Peacock": {"🦚🦚🦚": "🦚"}}} + if seen_data: + assert {t["name"] for t in package_state["dropped_tables"]} == tables_to_drop + else: + assert package_state == {} + assert set(source_clone.schema.data_table_names(include_incomplete=True)) == left_in_schema + + # truncate only + tables_to_truncate = ( + {"_peacock", "_wide_peacock", "_peacock__peacock", "_wide_peacock__peacock"} + if seen_data + else set() + ) + all_in_schema = ( + {"_schedule", "_peacock", "_wide_peacock", "_peacock__peacock", "_wide_peacock__peacock"} + if seen_data + else {"_schedule", "_peacock", "_wide_peacock"} + ) + drop_info = drop_resources( + pipeline.default_schema.clone(), + pipeline.state, + resources=["🦚Peacock", "🦚WidePeacock"], + state_only=True, + ) + assert set(t["name"] for t in drop_info.modified_tables) == tables_to_truncate + # state is modified + assert drop_info.state["sources"]["airtable_emojis"] == {"resources": {}} + assert drop_info.info["tables"] == [] + # no tables with data will be dropped + assert drop_info.info["tables_with_data"] == [] + assert set(drop_info.schema.data_table_names(include_incomplete=True)) == all_in_schema + source_clone = source_clone.with_resources("🦚Peacock", "🦚WidePeacock") + source_clone.schema = pipeline.default_schema.clone() + with pipeline.managed_state() as state: + package_state = refresh_source(pipeline, source_clone, refresh="drop_data") + # state modified + assert state["sources"]["airtable_emojis"] == {"resources": {}} + if seen_data: + assert {t["name"] for t in package_state["truncated_tables"]} == tables_to_truncate + else: + assert package_state == {} + assert set(source_clone.schema.data_table_names(include_incomplete=True)) == all_in_schema + + +def test_drop_unknown_resource() -> None: + pipeline = dlt.pipeline("test_drop_unknown_resource", destination="duckdb") + # extract first which should produce tables that didn't seen data + source = airtable_emojis().with_resources( + "📆 Schedule", "🦚Peacock", "🦚WidePeacock", "💰Budget" + ) + info = pipeline.run(source) + assert_load_info(info) + drop = DropCommand(pipeline, resources=["💰Budget"]) + assert drop.is_empty + + source.schema = pipeline.default_schema + package_state = refresh_source( + pipeline, source.with_resources("💰Budget"), refresh="drop_resources" + ) + assert package_state == {} + + info = pipeline.run(source.with_resources("💰Budget"), refresh="drop_resources") + 
# nothing loaded + assert_load_info(info, 0) + + +def test_modified_state_in_package() -> None: + pipeline = dlt.pipeline("test_modified_state_in_package", destination="duckdb") + # extract first which should produce tables that didn't seen data + source = airtable_emojis().with_resources( + "📆 Schedule", "🦚Peacock", "🦚WidePeacock", "💰Budget" + ) + pipeline.extract(source) + # run again to change peacock state again + info = pipeline.extract(source) + normalize_storage = pipeline._get_normalize_storage() + package_state = normalize_storage.extracted_packages.get_load_package_state(info.loads_ids[0]) + pipeline_state = decompress_state(package_state["pipeline_state"]["state"]) + assert pipeline_state["sources"]["airtable_emojis"] == { + "resources": {"🦚Peacock": {"🦚🦚🦚": "🦚🦚"}} + } + + # remove state + info = pipeline.extract(airtable_emojis().with_resources("🦚Peacock"), refresh="drop_resources") + normalize_storage = pipeline._get_normalize_storage() + package_state = normalize_storage.extracted_packages.get_load_package_state(info.loads_ids[0]) + # nothing to drop + assert "dropped_tables" not in package_state + pipeline_state = decompress_state(package_state["pipeline_state"]["state"]) + # the state was reset to the original + assert pipeline_state["sources"]["airtable_emojis"] == { + "resources": {"🦚Peacock": {"🦚🦚🦚": "🦚"}} + } + + +def test_drop_tables_force_extract_state() -> None: + # if any tables will be dropped, state must be extracted even if it is not changed + pipeline = dlt.pipeline("test_drop_tables_force_extract_state", destination="duckdb") + source = airtable_emojis().with_resources( + "📆 Schedule", "🦚Peacock", "🦚WidePeacock", "💰Budget" + ) + info = pipeline.run(source) + assert_load_info(info) + # dropping schedule should not change the state + info = pipeline.run(airtable_emojis().with_resources("📆 Schedule"), refresh="drop_resources") + assert_load_info(info) + storage = pipeline._get_load_storage() + package_state = storage.get_load_package_state(info.loads_ids[0]) + assert package_state["dropped_tables"][0]["name"] == "_schedule" + assert "pipeline_state" in package_state + + # here we drop and set state to original, so without forcing state extract state would not be present + info = pipeline.run(airtable_emojis().with_resources("🦚Peacock"), refresh="drop_resources") + assert_load_info(info) + storage = pipeline._get_load_storage() + package_state = storage.get_load_package_state(info.loads_ids[0]) + # child table also dropped + assert len(package_state["dropped_tables"]) == 2 + assert "pipeline_state" in package_state diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 6a6bf4bde1..328119970a 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -2271,21 +2271,57 @@ def test_change_naming_convention_name_collision() -> None: os.environ["SOURCES__AIRTABLE_EMOJIS__SCHEMA__NAMING"] = "sql_ci_v1" with pytest.raises(PipelineStepFailed) as pip_ex: pipeline.run(airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock")) + # see conflicts early + assert pip_ex.value.step == "extract" assert isinstance(pip_ex.value.__cause__, TableIdentifiersFrozen) # all good if we drop tables - # info = pipeline.run( - # airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock"), - # refresh="drop_resources", - # ) - # assert_load_info(info) - # assert load_data_table_counts(pipeline) == { - # "📆 Schedule": 3, - # "🦚Peacock": 1, - # "🦚WidePeacock": 1, - # "🦚Peacock__peacock": 3, - # 
"🦚WidePeacock__Peacock": 3, - # } + info = pipeline.run( + airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock"), + refresh="drop_resources", + ) + assert_load_info(info) + # case insensitive normalization + assert load_data_table_counts(pipeline) == { + "_schedule": 3, + "_peacock": 1, + "_widepeacock": 1, + "_peacock__peacock": 3, + "_widepeacock__peacock": 3, + } + + +def test_change_to_more_lax_naming_convention_name_collision() -> None: + # use snake_case which is strict and then change to duck_case which accepts snake_case names without any changes + # still we want to detect collisions + pipeline = dlt.pipeline( + "test_change_to_more_lax_naming_convention_name_collision", destination="duckdb" + ) + info = pipeline.run( + airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock") + ) + assert_load_info(info) + assert "_peacock" in pipeline.default_schema.tables + + # use duck case to load data into duckdb so casing and emoji are preserved + duck_ = dlt.destinations.duckdb(naming_convention="duck_case") + + # changing destination to one with a separate naming convention raises immediately + with pytest.raises(TableIdentifiersFrozen): + pipeline.run( + airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock"), + destination=duck_, + ) + + # refresh on the source level will work + info = pipeline.run( + airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock"), + destination=duck_, + refresh="drop_sources", + ) + assert_load_info(info) + # make sure that emojis got in + assert "🦚Peacock" in pipeline.default_schema.tables def test_change_naming_convention_column_collision() -> None: diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index c10618a7cc..f2e0058891 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -6,6 +6,7 @@ import dlt from dlt.common import json, sleep +from dlt.common.destination.exceptions import DestinationUndefinedEntity from dlt.common.pipeline import LoadInfo from dlt.common.schema.utils import get_table_format from dlt.common.typing import DictStrAny @@ -47,7 +48,9 @@ def budget(): @dlt.resource(name="🦚Peacock", selected=False, primary_key="🔑id") def peacock(): - dlt.current.resource_state()["🦚🦚🦚"] = "🦚" + r_state = dlt.current.resource_state() + r_state.setdefault("🦚🦚🦚", "") + r_state["🦚🦚🦚"] += "🦚" yield [{"peacock": [1, 2, 3], "🔑id": 1}] @dlt.resource(name="🦚WidePeacock", selected=False) diff --git a/tox.ini b/tox.ini index ed6c69c585..059f6a586a 100644 --- a/tox.ini +++ b/tox.ini @@ -7,3 +7,6 @@ banned-modules = datetime = use dlt.common.pendulum open = use dlt.common.open pendulum = use dlt.common.pendulum extend-immutable-calls = dlt.sources.incremental +per-file-ignores = + tests/*: T20 + docs/*: T20 \ No newline at end of file From 5d23e21d1fa6b2fc4db09b7c89e372184df9fc65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Willi=20M=C3=BCller?= Date: Fri, 5 Jul 2024 18:47:32 +0530 Subject: [PATCH 55/61] rest client: makes request parameters optional so that client.post() works also without query string (#1544) --- dlt/sources/helpers/rest_client/client.py | 2 +- tests/sources/helpers/rest_client/conftest.py | 22 +++++++++++++ .../helpers/rest_client/test_client.py | 33 ++++++++++++++++++- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/dlt/sources/helpers/rest_client/client.py b/dlt/sources/helpers/rest_client/client.py index e6135b5c0f..2cc19f6624 100644 --- a/dlt/sources/helpers/rest_client/client.py +++ b/dlt/sources/helpers/rest_client/client.py @@ 
-95,7 +95,7 @@ def _create_request( self, path: str, method: HTTPMethod, - params: Dict[str, Any], + params: Optional[Dict[str, Any]] = None, json: Optional[Dict[str, Any]] = None, auth: Optional[AuthBase] = None, hooks: Optional[Hooks] = None, diff --git a/tests/sources/helpers/rest_client/conftest.py b/tests/sources/helpers/rest_client/conftest.py index 08233bc3a8..b56ef6bbbf 100644 --- a/tests/sources/helpers/rest_client/conftest.py +++ b/tests/sources/helpers/rest_client/conftest.py @@ -171,6 +171,28 @@ def post_detail_404(request, context): def posts_with_results_key(request, context): return paginate_response(request, generate_posts(), records_key="many-results") + @router.post(r"/posts/search$") + def search_posts(request, context): + body = request.json() + page_size = body.get("page_size", 10) + page_number = body.get("page", 1) + + # Simulate a search with filtering + records = generate_posts() + ids_greater_than = body.get("ids_greater_than", 0) + records = [r for r in records if r["id"] > ids_greater_than] + + total_records = len(records) + total_pages = (total_records + page_size - 1) // page_size + start_index = (page_number - 1) * page_size + end_index = start_index + page_size + records_slice = records[start_index:end_index] + + return { + "data": records_slice, + "next_page": page_number + 1 if page_number < total_pages else None, + } + @router.get("/protected/posts/basic-auth") def protected_basic_auth(request, context): auth = request.headers.get("Authorization") diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index aa3f02e51d..35bcbd5279 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -22,7 +22,7 @@ ) from dlt.sources.helpers.rest_client.client import Hooks from dlt.sources.helpers.rest_client.exceptions import IgnoreResponseException -from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator +from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator, BaseReferencePaginator from .conftest import assert_pagination @@ -394,3 +394,34 @@ def _fake_send(*args, **kwargs): result = rest_client.get("/posts/1") assert result.status_code == 200 + + def test_paginate_json_body_without_params(self, rest_client) -> None: + class JSONBodyPageCursorPaginator(BaseReferencePaginator): + def update_state(self, response): + self._next_reference = response.json().get("next_page") + + def update_request(self, request): + if request.json is None: + request.json = {} + + request.json["page"] = self._next_reference + + page_generator = rest_client.paginate( + path="/posts/search", + method="POST", + json={"ids_greater_than": 50}, + paginator=JSONBodyPageCursorPaginator(), + ) + result = [post for page in list(page_generator) for post in page] + for i in range(49): + assert result[i] == {"id": 51 + i, "title": f"Post {51 + i}"} + + def test_post_json_body_without_params(self, rest_client) -> None: + result = rest_client.post( + path="/posts/search", + json={"ids_greater_than": 50}, + ) + returned_posts = result.json()["data"] + assert len(returned_posts) == 10 # only one page is returned + for i in range(10): + assert returned_posts[i] == {"id": 51 + i, "title": f"Post {51 + i}"} From bbaccbfd520cf057e2a57dc966367213a1899add Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Fri, 5 Jul 2024 15:19:05 +0200 Subject: [PATCH 56/61] RESTClient: add integrations tests for paginators (#1509) * Refactor mock_api_server, add 
integration tests for header link, json link and offset paginator * Fix assert_pagination * Rename a param * Fix off-by-one error in PageNumberPaginator; add pagination tests * Rename the index_base param; add mock api tests * Fixed a test condition * Fix formatting * Add default page size and total pages * Implement tests for JSONResponseCursorPaginator; extend tests for mock_api_server * Rename an arg in the doctring * Rename the index_base parameter * Update the docs --- dlt/sources/helpers/rest_client/paginators.py | 22 +- .../docs/general-usage/http/rest-client.md | 5 +- .../sources/helpers/rest_client/api_router.py | 61 ++++ tests/sources/helpers/rest_client/conftest.py | 205 +++++------- .../sources/helpers/rest_client/paginators.py | 125 +++++++ .../rest_client/test_mock_api_server.py | 310 ++++++++++++++++++ .../helpers/rest_client/test_paginators.py | 123 ++++++- 7 files changed, 723 insertions(+), 128 deletions(-) create mode 100644 tests/sources/helpers/rest_client/api_router.py create mode 100644 tests/sources/helpers/rest_client/paginators.py create mode 100644 tests/sources/helpers/rest_client/test_mock_api_server.py diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py index b6702797e9..701f0c914b 100644 --- a/dlt/sources/helpers/rest_client/paginators.py +++ b/dlt/sources/helpers/rest_client/paginators.py @@ -91,6 +91,7 @@ def __init__( param_name: str, initial_value: int, value_step: int, + base_index: int = 0, maximum_value: Optional[int] = None, total_path: Optional[jsonpath.TJsonPath] = None, error_message_items: str = "items", @@ -101,6 +102,8 @@ def __init__( For example, 'page'. initial_value (int): The initial value of the numeric parameter. value_step (int): The step size to increment the numeric parameter. + base_index (int, optional): The index of the initial element. + Used to define 0-based or 1-based indexing. Defaults to 0. maximum_value (int, optional): The maximum value for the numeric parameter. If provided, pagination will stop once this value is reached or exceeded, even if more data is available. This allows you @@ -119,6 +122,7 @@ def __init__( self.param_name = param_name self.current_value = initial_value self.value_step = value_step + self.base_index = base_index self.maximum_value = maximum_value self.total_path = jsonpath.compile_path(total_path) if total_path else None self.error_message_items = error_message_items @@ -145,7 +149,7 @@ def update_state(self, response: Response) -> None: self.current_value += self.value_step - if (total is not None and self.current_value >= total) or ( + if (total is not None and self.current_value >= total + self.base_index) or ( self.maximum_value is not None and self.current_value >= self.maximum_value ): self._has_next_page = False @@ -219,14 +223,20 @@ def get_items(): def __init__( self, - initial_page: int = 0, + base_page: int = 0, + page: int = None, page_param: str = "page", total_path: jsonpath.TJsonPath = "total", maximum_page: Optional[int] = None, ): """ Args: - initial_page (int): The initial page number. + base_page (int): The index of the initial page from the API perspective. + Determines the page number that the API server uses for the starting + page. Normally, this is 0-based or 1-based (e.g., 1, 2, 3, ...) + indexing for the pages. Defaults to 0. + page (int): The page number for the first request. If not provided, + the initial value will be set to `base_page`. page_param (str): The query parameter name for the page number. 
Defaults to 'page'. total_path (jsonpath.TJsonPath): The JSONPath expression for @@ -238,9 +248,13 @@ def __init__( """ if total_path is None and maximum_page is None: raise ValueError("Either `total_path` or `maximum_page` must be provided.") + + page = page if page is not None else base_page + super().__init__( param_name=page_param, - initial_value=initial_page, + initial_value=page, + base_index=base_page, total_path=total_path, value_step=1, maximum_value=maximum_page, diff --git a/docs/website/docs/general-usage/http/rest-client.md b/docs/website/docs/general-usage/http/rest-client.md index 3a7276a534..61af3d2057 100644 --- a/docs/website/docs/general-usage/http/rest-client.md +++ b/docs/website/docs/general-usage/http/rest-client.md @@ -164,7 +164,7 @@ def get_data(): #### HeaderLinkPaginator -This paginator handles pagination based on a link to the next page in the response headers (e.g., the `Link` header, as used by GitHub). +This paginator handles pagination based on a link to the next page in the response headers (e.g., the `Link` header, as used by GitHub API). **Parameters:** @@ -231,7 +231,8 @@ Note, that in this case, the `total_path` parameter is set explicitly to `None` **Parameters:** -- `initial_page`: The starting page number. Defaults to `1`. +- `base_page`: The index of the initial page from the API perspective. Normally, it's 0-based or 1-based (e.g., 1, 2, 3, ...) indexing for the pages. Defaults to 0. +- `page`: The page number for the first request. If not provided, the initial value will be set to `base_page`. - `page_param`: The query parameter name for the page number. Defaults to `"page"`. - `total_path`: A JSONPath expression for the total number of pages. If not provided, pagination is controlled by `maximum_page`. - `maximum_page`: Optional maximum page number. Stops pagination once this page is reached. 
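A minimal usage sketch of the `base_page`/`page` parameters described above, assuming a one-based API similar to the mock `/posts` endpoint used in the tests that follow (the base URL and response shape are assumptions):

```py
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator

client = RESTClient(base_url="https://api.example.com")

# one-based API: the first page is 1, so base_page and page both start at 1;
# total_path points at the response field that reports the number of pages
for page in client.paginate(
    "/posts",
    paginator=PageNumberPaginator(base_page=1, page=1, total_path="total_pages"),
):
    print(page)
```

For a zero-based API, the same call would use `base_page=0` and omit `page`, letting it default to `base_page`.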
diff --git a/tests/sources/helpers/rest_client/api_router.py b/tests/sources/helpers/rest_client/api_router.py new file mode 100644 index 0000000000..661a4d3468 --- /dev/null +++ b/tests/sources/helpers/rest_client/api_router.py @@ -0,0 +1,61 @@ +import re +from typing import NamedTuple, Callable, Pattern, List, Union, TYPE_CHECKING, Dict, Any + +import requests_mock + +from dlt.common import json + +if TYPE_CHECKING: + RequestCallback = Callable[ + [requests_mock.Request, requests_mock.Context], Union[str, Dict[str, Any], List[Any]] + ] + ResponseSerializer = Callable[[requests_mock.Request, requests_mock.Context], str] +else: + RequestCallback = Callable + ResponseSerializer = Callable + + +class Route(NamedTuple): + method: str + pattern: Pattern[str] + callback: ResponseSerializer + + +class APIRouter: + def __init__(self, base_url: str): + self.routes: List[Route] = [] + self.base_url = base_url + + def _add_route(self, method: str, pattern: str, func: RequestCallback) -> RequestCallback: + compiled_pattern = re.compile(f"{self.base_url}{pattern}") + + def serialize_response(request, context): + result = func(request, context) + + if isinstance(result, dict) or isinstance(result, list): + return json.dumps(result) + + return result + + self.routes.append(Route(method, compiled_pattern, serialize_response)) + return serialize_response + + def get(self, pattern: str) -> Callable[[RequestCallback], RequestCallback]: + def decorator(func: RequestCallback) -> RequestCallback: + return self._add_route("GET", pattern, func) + + return decorator + + def post(self, pattern: str) -> Callable[[RequestCallback], RequestCallback]: + def decorator(func: RequestCallback) -> RequestCallback: + return self._add_route("POST", pattern, func) + + return decorator + + def register_routes(self, mocker: requests_mock.Mocker) -> None: + for route in self.routes: + mocker.register_uri( + route.method, + route.pattern, + text=route.callback, + ) diff --git a/tests/sources/helpers/rest_client/conftest.py b/tests/sources/helpers/rest_client/conftest.py index b56ef6bbbf..e8367bbe51 100644 --- a/tests/sources/helpers/rest_client/conftest.py +++ b/tests/sources/helpers/rest_client/conftest.py @@ -1,156 +1,119 @@ -import re -from typing import NamedTuple, Callable, Pattern, Union, TYPE_CHECKING, Dict, List, Any import base64 - -from urllib.parse import parse_qs, urlsplit, urlunsplit +from urllib.parse import parse_qs, urlsplit, urlunsplit, urlencode import pytest import requests_mock -from dlt.common import json - -if TYPE_CHECKING: - RequestCallback = Callable[ - [requests_mock.Request, requests_mock.Context], Union[str, Dict[str, Any], List[Any]] - ] - ResponseSerializer = Callable[[requests_mock.Request, requests_mock.Context], str] -else: - RequestCallback = Callable - ResponseSerializer = Callable - -MOCK_BASE_URL = "https://api.example.com" - - -class Route(NamedTuple): - method: str - pattern: Pattern[str] - callback: ResponseSerializer +from dlt.sources.helpers.rest_client import RESTClient +from .api_router import APIRouter +from .paginators import PageNumberPaginator, OffsetPaginator, CursorPaginator -class APIRouter: - def __init__(self, base_url: str): - self.routes: List[Route] = [] - self.base_url = base_url - def _add_route(self, method: str, pattern: str, func: RequestCallback) -> RequestCallback: - compiled_pattern = re.compile(f"{self.base_url}{pattern}") - - def serialize_response(request, context): - result = func(request, context) +MOCK_BASE_URL = "https://api.example.com" +DEFAULT_PAGE_SIZE = 5 
+DEFAULT_TOTAL_PAGES = 5 +DEFAULT_LIMIT = 10 - if isinstance(result, dict) or isinstance(result, list): - return json.dumps(result) - return result +router = APIRouter(MOCK_BASE_URL) - self.routes.append(Route(method, compiled_pattern, serialize_response)) - return serialize_response - def get(self, pattern: str) -> Callable[[RequestCallback], RequestCallback]: - def decorator(func: RequestCallback) -> RequestCallback: - return self._add_route("GET", pattern, func) +def generate_posts(count=DEFAULT_PAGE_SIZE * DEFAULT_TOTAL_PAGES): + return [{"id": i, "title": f"Post {i}"} for i in range(count)] - return decorator - def post(self, pattern: str) -> Callable[[RequestCallback], RequestCallback]: - def decorator(func: RequestCallback) -> RequestCallback: - return self._add_route("POST", pattern, func) +def generate_comments(post_id, count=50): + return [{"id": i, "body": f"Comment {i} for post {post_id}"} for i in range(count)] - return decorator - def register_routes(self, mocker: requests_mock.Mocker) -> None: - for route in self.routes: - mocker.register_uri( - route.method, - route.pattern, - text=route.callback, - ) +def get_page_number(qs, key="page", default=1): + return int(qs.get(key, [default])[0]) -router = APIRouter(MOCK_BASE_URL) +def create_next_page_url(request, paginator, use_absolute_url=True): + scheme, netloc, path, _, _ = urlsplit(request.url) + query = urlencode(paginator.next_page_url_params) + if use_absolute_url: + return urlunsplit([scheme, netloc, path, query, ""]) + else: + return f"{path}?{query}" -def serialize_page( - records, - page_number, - total_pages, - request_url, - records_key="data", - use_absolute_url=True, +def paginate_by_page_number( + request, records, records_key="data", use_absolute_url=True, index_base=1 ): - """Serialize a page of records into a dict with pagination metadata.""" - if records_key is None: - return records + page_number = get_page_number(request.qs, default=index_base) + paginator = PageNumberPaginator(records, page_number, index_base=index_base) response = { - records_key: records, - "page": page_number, - "total_pages": total_pages, + records_key: paginator.page_records, + **paginator.metadata, } - if page_number < total_pages: - next_page = page_number + 1 - - scheme, netloc, path, _, _ = urlsplit(request_url) - if use_absolute_url: - next_page_url = urlunsplit([scheme, netloc, path, f"page={next_page}", ""]) - else: - next_page_url = f"{path}?page={next_page}" - - response["next_page"] = next_page_url + if paginator.next_page_url_params: + response["next_page"] = create_next_page_url(request, paginator, use_absolute_url) return response -def generate_posts(count=100): - return [{"id": i, "title": f"Post {i}"} for i in range(count)] +@pytest.fixture(scope="module") +def mock_api_server(): + with requests_mock.Mocker() as m: + @router.get(r"/posts(\?page=\d+)?$") + def posts(request, context): + return paginate_by_page_number(request, generate_posts()) -def generate_comments(post_id, count=50): - return [{"id": i, "body": f"Comment {i} for post {post_id}"} for i in range(count)] + @router.get(r"/posts_zero_based(\?page=\d+)?$") + def posts_zero_based(request, context): + return paginate_by_page_number(request, generate_posts(), index_base=0) + @router.get(r"/posts_header_link(\?page=\d+)?$") + def posts_header_link(request, context): + records = generate_posts() + page_number = get_page_number(request.qs) + paginator = PageNumberPaginator(records, page_number) -def get_page_number(qs, key="page", default=1): - return 
int(qs.get(key, [default])[0]) + response = paginator.page_records + if paginator.next_page_url_params: + next_page_url = create_next_page_url(request, paginator) + context.headers["Link"] = f'<{next_page_url}>; rel="next"' -def paginate_response(request, records, page_size=10, records_key="data", use_absolute_url=True): - page_number = get_page_number(request.qs) - total_records = len(records) - total_pages = (total_records + page_size - 1) // page_size - start_index = (page_number - 1) * 10 - end_index = start_index + 10 - records_slice = records[start_index:end_index] - return serialize_page( - records_slice, - page_number, - total_pages, - request.url, - records_key, - use_absolute_url, - ) + return response + @router.get(r"/posts_relative_next_url(\?page=\d+)?$") + def posts_relative_next_url(request, context): + return paginate_by_page_number(request, generate_posts(), use_absolute_url=False) -@pytest.fixture(scope="module") -def mock_api_server(): - with requests_mock.Mocker() as m: + @router.get(r"/posts_offset_limit(\?offset=\d+&limit=\d+)?$") + def posts_offset_limit(request, context): + records = generate_posts() + offset = int(request.qs.get("offset", [0])[0]) + limit = int(request.qs.get("limit", [DEFAULT_LIMIT])[0]) + paginator = OffsetPaginator(records, offset, limit) - @router.get(r"/posts_no_key(\?page=\d+)?$") - def posts_no_key(request, context): - return paginate_response(request, generate_posts(), records_key=None) + return { + "data": paginator.page_records, + **paginator.metadata, + } - @router.get(r"/posts(\?page=\d+)?$") - def posts(request, context): - return paginate_response(request, generate_posts()) + @router.get(r"/posts_cursor(\?cursor=\d+)?$") + def posts_cursor(request, context): + records = generate_posts() + cursor = int(request.qs.get("cursor", [0])[0]) + paginator = CursorPaginator(records, cursor) - @router.get(r"/posts_relative_next_url(\?page=\d+)?$") - def posts_relative_next_url(request, context): - return paginate_response(request, generate_posts(), use_absolute_url=False) + return { + "data": paginator.page_records, + **paginator.metadata, + } @router.get(r"/posts/(\d+)/comments") def post_comments(request, context): post_id = int(request.url.split("/")[-2]) - return paginate_response(request, generate_comments(post_id)) + return paginate_by_page_number(request, generate_comments(post_id)) @router.get(r"/posts/\d+$") def post_detail(request, context): @@ -169,7 +132,7 @@ def post_detail_404(request, context): @router.get(r"/posts_under_a_different_key$") def posts_with_results_key(request, context): - return paginate_response(request, generate_posts(), records_key="many-results") + return paginate_by_page_number(request, generate_posts(), records_key="many-results") @router.post(r"/posts/search$") def search_posts(request, context): @@ -199,7 +162,7 @@ def protected_basic_auth(request, context): creds = "user:password" creds_base64 = base64.b64encode(creds.encode()).decode() if auth == f"Basic {creds_base64}": - return paginate_response(request, generate_posts()) + return paginate_by_page_number(request, generate_posts()) context.status_code = 401 return {"error": "Unauthorized"} @@ -207,7 +170,7 @@ def protected_basic_auth(request, context): def protected_bearer_token(request, context): auth = request.headers.get("Authorization") if auth == "Bearer test-token": - return paginate_response(request, generate_posts()) + return paginate_by_page_number(request, generate_posts()) context.status_code = 401 return {"error": "Unauthorized"} @@ -215,7 
+178,7 @@ def protected_bearer_token(request, context): def protected_bearer_token_plain_text_erorr(request, context): auth = request.headers.get("Authorization") if auth == "Bearer test-token": - return paginate_response(request, generate_posts()) + return paginate_by_page_number(request, generate_posts()) context.status_code = 401 return "Unauthorized" @@ -223,7 +186,7 @@ def protected_bearer_token_plain_text_erorr(request, context): def protected_api_key(request, context): api_key = request.headers.get("x-api-key") if api_key == "test-api-key": - return paginate_response(request, generate_posts()) + return paginate_by_page_number(request, generate_posts()) context.status_code = 401 return {"error": "Unauthorized"} @@ -267,6 +230,14 @@ def custom_oauth_token(request, context): yield m +@pytest.fixture +def rest_client() -> RESTClient: + return RESTClient( + base_url="https://api.example.com", + headers={"Accept": "application/json"}, + ) + + def oauth_authorize(request): qs = parse_qs(request.text) grant_type = qs.get("grant_type")[0] @@ -279,7 +250,9 @@ def oauth_authorize(request): ) -def assert_pagination(pages, expected_start=0, page_size=10, total_pages=10): +def assert_pagination(pages, page_size=DEFAULT_PAGE_SIZE, total_pages=DEFAULT_TOTAL_PAGES): assert len(pages) == total_pages for i, page in enumerate(pages): - assert page == [{"id": i, "title": f"Post {i}"} for i in range(i * 10, (i + 1) * 10)] + assert page == [ + {"id": i, "title": f"Post {i}"} for i in range(i * page_size, (i + 1) * page_size) + ] diff --git a/tests/sources/helpers/rest_client/paginators.py b/tests/sources/helpers/rest_client/paginators.py new file mode 100644 index 0000000000..fdd8e6f4d8 --- /dev/null +++ b/tests/sources/helpers/rest_client/paginators.py @@ -0,0 +1,125 @@ +class BasePaginator: + def __init__(self, records): + self.records = records + + @property + def page_records(self): + """Return records for the current page.""" + raise NotImplementedError + + @property + def metadata(self): + """Return metadata for the current page. + E.g. total number of records, current page number, etc. + """ + raise NotImplementedError + + @property + def next_page_url_params(self): + """Return URL parameters for the next page. + This is used to generate the URL for the next page in the response. + """ + raise NotImplementedError + + +class PageNumberPaginator(BasePaginator): + def __init__(self, records, page_number, page_size=5, index_base=1): + """Paginate records by page number. + + Args: + records: List of records to paginate. + page_number: Page number to return. + page_size: Maximum number of records to return per page. + index_base: Index of the start page. E.g. zero-based + index or 1-based index. 
+ """ + super().__init__(records) + self.page_number = page_number + self.index_base = index_base + self.page_size = page_size + + @property + def page_records(self): + start_index = (self.page_number - self.index_base) * self.page_size + end_index = start_index + self.page_size + return self.records[start_index:end_index] + + @property + def metadata(self): + return {"page": self.page_number, "total_pages": self.total_pages} + + @property + def next_page_url_params(self): + return {"page": self.next_page_number} if self.next_page_number else {} + + @property + def total_pages(self): + total_records = len(self.records) + return (total_records + self.page_size - 1) // self.page_size + + @property + def next_page_number(self): + return ( + self.page_number + 1 + if self.page_number + 1 < self.total_pages + self.index_base + else None + ) + + +class OffsetPaginator(BasePaginator): + def __init__(self, records, offset, limit=10): + """Paginate records by offset. + + Args: + records: List of records to paginate. + offset: Offset to start slicing from. + limit: Maximum number of records to return. + """ + super().__init__(records) + self.offset = offset + self.limit = limit + + @property + def page_records(self): + return self.records[self.offset : self.offset + self.limit] + + @property + def metadata(self): + return {"total_records": len(self.records), "offset": self.offset, "limit": self.limit} + + @property + def next_page_url_params(self): + if self.offset + self.limit < len(self.records): + return {"offset": self.offset + self.limit, "limit": self.limit} + return {} + + +class CursorPaginator(BasePaginator): + def __init__(self, records, cursor, limit=5): + """Paginate records by cursor. + + Here, cursor is the index of the record to start slicing from. + + Args: + records: List of records to paginate. + cursor: Cursor to start slicing from. + limit: Maximum number of records to return. 
+ """ + super().__init__(records) + self.cursor = cursor + self.limit = limit + + @property + def page_records(self): + return self.records[self.cursor : self.cursor + self.limit] + + @property + def metadata(self): + next_index = self.cursor + self.limit + + if next_index < len(self.records): + next_cursor = next_index + else: + next_cursor = None + + return {"next_cursor": next_cursor} diff --git a/tests/sources/helpers/rest_client/test_mock_api_server.py b/tests/sources/helpers/rest_client/test_mock_api_server.py new file mode 100644 index 0000000000..cfdc920f3e --- /dev/null +++ b/tests/sources/helpers/rest_client/test_mock_api_server.py @@ -0,0 +1,310 @@ +import pytest + + +@pytest.mark.usefixtures("mock_api_server") +class TestMockAPIServer: + @pytest.mark.parametrize( + "test_case", + [ + # Page number is one-based + { + "url": "/posts", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 0, "title": "Post 0"}, + {"id": 1, "title": "Post 1"}, + {"id": 2, "title": "Post 2"}, + {"id": 3, "title": "Post 3"}, + {"id": 4, "title": "Post 4"}, + ], + "page": 1, + "total_pages": 5, + "next_page": "https://api.example.com/posts?page=2", + }, + }, + { + "url": "/posts?page=2", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 5, "title": "Post 5"}, + {"id": 6, "title": "Post 6"}, + {"id": 7, "title": "Post 7"}, + {"id": 8, "title": "Post 8"}, + {"id": 9, "title": "Post 9"}, + ], + "page": 2, + "total_pages": 5, + "next_page": "https://api.example.com/posts?page=3", + }, + }, + { + "url": "/posts?page=3", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 10, "title": "Post 10"}, + {"id": 11, "title": "Post 11"}, + {"id": 12, "title": "Post 12"}, + {"id": 13, "title": "Post 13"}, + {"id": 14, "title": "Post 14"}, + ], + "page": 3, + "total_pages": 5, + "next_page": "https://api.example.com/posts?page=4", + }, + }, + { + "url": "/posts?page=4", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 15, "title": "Post 15"}, + {"id": 16, "title": "Post 16"}, + {"id": 17, "title": "Post 17"}, + {"id": 18, "title": "Post 18"}, + {"id": 19, "title": "Post 19"}, + ], + "page": 4, + "total_pages": 5, + "next_page": "https://api.example.com/posts?page=5", + }, + }, + { + "url": "/posts?page=5", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 20, "title": "Post 20"}, + {"id": 21, "title": "Post 21"}, + {"id": 22, "title": "Post 22"}, + {"id": 23, "title": "Post 23"}, + {"id": 24, "title": "Post 24"}, + ], + "page": 5, + "total_pages": 5, + }, + }, + # Page number is zero-based + { + "url": "/posts_zero_based", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 0, "title": "Post 0"}, + {"id": 1, "title": "Post 1"}, + {"id": 2, "title": "Post 2"}, + {"id": 3, "title": "Post 3"}, + {"id": 4, "title": "Post 4"}, + ], + "page": 0, + "total_pages": 5, + "next_page": "https://api.example.com/posts_zero_based?page=1", + }, + }, + { + "url": "/posts_zero_based?page=1", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 5, "title": "Post 5"}, + {"id": 6, "title": "Post 6"}, + {"id": 7, "title": "Post 7"}, + {"id": 8, "title": "Post 8"}, + {"id": 9, "title": "Post 9"}, + ], + "page": 1, + "total_pages": 5, + "next_page": "https://api.example.com/posts_zero_based?page=2", + }, + }, + { + "url": "/posts_zero_based?page=2", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 10, "title": "Post 10"}, + {"id": 11, "title": "Post 11"}, + {"id": 12, "title": "Post 12"}, + {"id": 13, "title": "Post 13"}, + {"id": 14, 
"title": "Post 14"}, + ], + "page": 2, + "total_pages": 5, + "next_page": "https://api.example.com/posts_zero_based?page=3", + }, + }, + { + "url": "/posts_zero_based?page=3", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 15, "title": "Post 15"}, + {"id": 16, "title": "Post 16"}, + {"id": 17, "title": "Post 17"}, + {"id": 18, "title": "Post 18"}, + {"id": 19, "title": "Post 19"}, + ], + "page": 3, + "total_pages": 5, + "next_page": "https://api.example.com/posts_zero_based?page=4", + }, + }, + { + "url": "/posts_zero_based?page=4", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 20, "title": "Post 20"}, + {"id": 21, "title": "Post 21"}, + {"id": 22, "title": "Post 22"}, + {"id": 23, "title": "Post 23"}, + {"id": 24, "title": "Post 24"}, + ], + "page": 4, + "total_pages": 5, + }, + }, + # Test offset-limit pagination + { + "url": "/posts_offset_limit", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 0, "title": "Post 0"}, + {"id": 1, "title": "Post 1"}, + {"id": 2, "title": "Post 2"}, + {"id": 3, "title": "Post 3"}, + {"id": 4, "title": "Post 4"}, + {"id": 5, "title": "Post 5"}, + {"id": 6, "title": "Post 6"}, + {"id": 7, "title": "Post 7"}, + {"id": 8, "title": "Post 8"}, + {"id": 9, "title": "Post 9"}, + ], + "total_records": 25, + "offset": 0, + "limit": 10, + }, + }, + { + "url": "/posts_offset_limit?offset=10&limit=10", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 10, "title": "Post 10"}, + {"id": 11, "title": "Post 11"}, + {"id": 12, "title": "Post 12"}, + {"id": 13, "title": "Post 13"}, + {"id": 14, "title": "Post 14"}, + {"id": 15, "title": "Post 15"}, + {"id": 16, "title": "Post 16"}, + {"id": 17, "title": "Post 17"}, + {"id": 18, "title": "Post 18"}, + {"id": 19, "title": "Post 19"}, + ], + "total_records": 25, + "offset": 10, + "limit": 10, + }, + }, + { + "url": "/posts_offset_limit?offset=20&limit=10", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 20, "title": "Post 20"}, + {"id": 21, "title": "Post 21"}, + {"id": 22, "title": "Post 22"}, + {"id": 23, "title": "Post 23"}, + {"id": 24, "title": "Post 24"}, + ], + "total_records": 25, + "offset": 20, + "limit": 10, + }, + }, + # Test cursor pagination + { + "url": "/posts_cursor", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 0, "title": "Post 0"}, + {"id": 1, "title": "Post 1"}, + {"id": 2, "title": "Post 2"}, + {"id": 3, "title": "Post 3"}, + {"id": 4, "title": "Post 4"}, + ], + "next_cursor": 5, + }, + }, + { + "url": "/posts_cursor?cursor=5", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 5, "title": "Post 5"}, + {"id": 6, "title": "Post 6"}, + {"id": 7, "title": "Post 7"}, + {"id": 8, "title": "Post 8"}, + {"id": 9, "title": "Post 9"}, + ], + "next_cursor": 10, + }, + }, + { + "url": "/posts_cursor?cursor=10", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 10, "title": "Post 10"}, + {"id": 11, "title": "Post 11"}, + {"id": 12, "title": "Post 12"}, + {"id": 13, "title": "Post 13"}, + {"id": 14, "title": "Post 14"}, + ], + "next_cursor": 15, + }, + }, + { + "url": "/posts_cursor?cursor=15", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 15, "title": "Post 15"}, + {"id": 16, "title": "Post 16"}, + {"id": 17, "title": "Post 17"}, + {"id": 18, "title": "Post 18"}, + {"id": 19, "title": "Post 19"}, + ], + "next_cursor": 20, + }, + }, + { + "url": "/posts_cursor?cursor=20", + "status_code": 200, + "expected_json": { + "data": [ + {"id": 20, "title": "Post 20"}, + {"id": 21, 
"title": "Post 21"}, + {"id": 22, "title": "Post 22"}, + {"id": 23, "title": "Post 23"}, + {"id": 24, "title": "Post 24"}, + ], + "next_cursor": None, + }, + }, + ], + ) + def test_paginate_success(self, test_case, rest_client): + response = rest_client.get(test_case["url"]) + assert response.status_code == test_case["status_code"] + assert response.json() == test_case["expected_json"] + + @pytest.mark.skip(reason="Not implemented") + def test_paginate_by_page_number_invalid_page(self, rest_client): + response = rest_client.get("/posts?page=6") + assert response.status_code == 404 + assert response.json() == {"error": "Not Found"} diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py index e5d31c52d2..a5f9d888a2 100644 --- a/tests/sources/helpers/rest_client/test_paginators.py +++ b/tests/sources/helpers/rest_client/test_paginators.py @@ -11,9 +11,13 @@ PageNumberPaginator, HeaderLinkPaginator, JSONResponsePaginator, + JSONResponseCursorPaginator, ) +from .conftest import assert_pagination + +@pytest.mark.usefixtures("mock_api_server") class TestHeaderLinkPaginator: def test_update_state_with_next(self): paginator = HeaderLinkPaginator() @@ -30,7 +34,18 @@ def test_update_state_without_next(self): paginator.update_state(response) assert paginator.has_next_page is False + def test_client_pagination(self, rest_client): + pages_iter = rest_client.paginate( + "/posts_header_link", + paginator=HeaderLinkPaginator(), + ) + + pages = list(pages_iter) + assert_pagination(pages) + + +@pytest.mark.usefixtures("mock_api_server") class TestJSONResponsePaginator: @pytest.mark.parametrize( "test_case", @@ -182,7 +197,20 @@ def test_no_duplicate_params_on_update_request(self): # The next request should just use the "next" URL without any duplicate parameters. 
assert prepared_request.url == "http://example.com/api/resource?page=2¶m1=value1" + def test_client_pagination(self, rest_client): + pages_iter = rest_client.paginate( + "/posts", + paginator=JSONResponsePaginator( + next_url_path="next_page", + ), + ) + + pages = list(pages_iter) + + assert_pagination(pages) + +@pytest.mark.usefixtures("mock_api_server") class TestSinglePagePaginator: def test_update_state(self): paginator = SinglePagePaginator() @@ -197,7 +225,18 @@ def test_update_state_with_next(self): paginator.update_state(response) assert paginator.has_next_page is False + def test_client_pagination(self, rest_client): + pages_iter = rest_client.paginate( + "/posts", + paginator=SinglePagePaginator(), + ) + + pages = list(pages_iter) + + assert_pagination(pages, total_pages=1) + +@pytest.mark.usefixtures("mock_api_server") class TestOffsetPaginator: def test_update_state(self): paginator = OffsetPaginator(offset=0, limit=10) @@ -263,40 +302,55 @@ def test_maximum_offset(self): assert paginator.current_value == 100 assert paginator.has_next_page is False + def test_client_pagination(self, rest_client): + pages_iter = rest_client.paginate( + "/posts_offset_limit", + paginator=OffsetPaginator(offset=0, limit=5, total_path="total_records"), + ) + + pages = list(pages_iter) + assert_pagination(pages) + + +@pytest.mark.usefixtures("mock_api_server") class TestPageNumberPaginator: def test_update_state(self): - paginator = PageNumberPaginator(initial_page=1, total_path="total_pages") + paginator = PageNumberPaginator(base_page=1, page=1, total_path="total_pages") response = Mock(Response, json=lambda: {"total_pages": 3}) paginator.update_state(response) assert paginator.current_value == 2 assert paginator.has_next_page is True + paginator.update_state(response) + assert paginator.current_value == 3 + assert paginator.has_next_page is True + # Test for reaching the end paginator.update_state(response) assert paginator.has_next_page is False def test_update_state_with_string_total_pages(self): - paginator = PageNumberPaginator(1) + paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {"total": "3"}) paginator.update_state(response) assert paginator.current_value == 2 assert paginator.has_next_page is True def test_update_state_with_invalid_total_pages(self): - paginator = PageNumberPaginator(1) + paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {"total_pages": "invalid"}) with pytest.raises(ValueError): paginator.update_state(response) def test_update_state_without_total_pages(self): - paginator = PageNumberPaginator(1) + paginator = PageNumberPaginator(base_page=1, page=1) response = Mock(Response, json=lambda: {}) with pytest.raises(ValueError): paginator.update_state(response) def test_update_request(self): - paginator = PageNumberPaginator(initial_page=1, page_param="page") + paginator = PageNumberPaginator(base_page=1, page=1, page_param="page") request = Mock(Request) response = Mock(Response, json=lambda: {"total": 3}) paginator.update_state(response) @@ -308,7 +362,7 @@ def test_update_request(self): assert request.params["page"] == 3 def test_maximum_page(self): - paginator = PageNumberPaginator(initial_page=1, maximum_page=3, total_path=None) + paginator = PageNumberPaginator(base_page=1, page=1, maximum_page=3, total_path=None) response = Mock(Response, json=lambda: {"items": []}) paginator.update_state(response) # Page 1 assert paginator.current_value == 2 @@ -317,3 +371,60 @@ def 
test_maximum_page(self): paginator.update_state(response) # Page 2 assert paginator.current_value == 3 assert paginator.has_next_page is False + + def test_client_pagination_one_based(self, rest_client): + pages_iter = rest_client.paginate( + "/posts", + paginator=PageNumberPaginator(base_page=1, page=1, total_path="total_pages"), + ) + + pages = list(pages_iter) + + assert_pagination(pages) + + def test_client_pagination_one_based_default_page(self, rest_client): + pages_iter = rest_client.paginate( + "/posts", + paginator=PageNumberPaginator(base_page=1, total_path="total_pages"), + ) + + pages = list(pages_iter) + + assert_pagination(pages) + + def test_client_pagination_zero_based(self, rest_client): + pages_iter = rest_client.paginate( + "/posts_zero_based", + paginator=PageNumberPaginator(base_page=0, page=0, total_path="total_pages"), + ) + + pages = list(pages_iter) + + assert_pagination(pages) + + +@pytest.mark.usefixtures("mock_api_server") +class TestJSONResponseCursorPaginator: + def test_update_state(self): + paginator = JSONResponseCursorPaginator(cursor_path="next_cursor") + response = Mock(Response, json=lambda: {"next_cursor": "cursor-2", "results": []}) + paginator.update_state(response) + assert paginator._next_reference == "cursor-2" + assert paginator.has_next_page is True + + def test_update_request(self): + paginator = JSONResponseCursorPaginator(cursor_path="next_cursor") + paginator._next_reference = "cursor-2" + request = Request(method="GET", url="http://example.com/api/resource") + paginator.update_request(request) + assert request.params["cursor"] == "cursor-2" + + def test_client_pagination(self, rest_client): + pages_iter = rest_client.paginate( + "/posts_cursor", + paginator=JSONResponseCursorPaginator(cursor_path="next_cursor"), + ) + + pages = list(pages_iter) + + assert_pagination(pages) From 18eec65d35cec0fc855e3f6c32d22a69d622c95a Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Fri, 5 Jul 2024 19:11:03 +0200 Subject: [PATCH 57/61] fixes rest post method client tests --- dlt/load/load.py | 4 +++- tests/sources/helpers/rest_client/conftest.py | 2 +- .../helpers/rest_client/test_client.py | 21 ++++++++++++------- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/dlt/load/load.py b/dlt/load/load.py index 0e78650a84..a8dfb7002e 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -197,7 +197,9 @@ def spool_new_jobs(self, load_id: str, schema: Schema) -> Tuple[int, List[LoadJo # use thread based pool as jobs processing is mostly I/O and we do not want to pickle jobs load_files = filter_new_jobs( self.load_storage.list_new_jobs(load_id), - self.destination.capabilities(self.destination.configuration(self.initial_client_config)), + self.destination.capabilities( + self.destination.configuration(self.initial_client_config) + ), self.config, ) file_count = len(load_files) diff --git a/tests/sources/helpers/rest_client/conftest.py b/tests/sources/helpers/rest_client/conftest.py index e8367bbe51..10dd23877d 100644 --- a/tests/sources/helpers/rest_client/conftest.py +++ b/tests/sources/helpers/rest_client/conftest.py @@ -137,7 +137,7 @@ def posts_with_results_key(request, context): @router.post(r"/posts/search$") def search_posts(request, context): body = request.json() - page_size = body.get("page_size", 10) + page_size = body.get("page_size", DEFAULT_PAGE_SIZE) page_number = body.get("page", 1) # Simulate a search with filtering diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py 
index 35bcbd5279..ed227cd3cd 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -24,7 +24,7 @@ from dlt.sources.helpers.rest_client.exceptions import IgnoreResponseException from dlt.sources.helpers.rest_client.paginators import JSONResponsePaginator, BaseReferencePaginator -from .conftest import assert_pagination +from .conftest import DEFAULT_PAGE_SIZE, DEFAULT_TOTAL_PAGES, assert_pagination def load_private_key(name="private_key.pem"): @@ -396,6 +396,9 @@ def _fake_send(*args, **kwargs): assert result.status_code == 200 def test_paginate_json_body_without_params(self, rest_client) -> None: + # leave 3 pages of data + posts_skip = (DEFAULT_TOTAL_PAGES - 3) * DEFAULT_PAGE_SIZE + class JSONBodyPageCursorPaginator(BaseReferencePaginator): def update_state(self, response): self._next_reference = response.json().get("next_page") @@ -409,19 +412,21 @@ def update_request(self, request): page_generator = rest_client.paginate( path="/posts/search", method="POST", - json={"ids_greater_than": 50}, + json={"ids_greater_than": posts_skip - 1}, paginator=JSONBodyPageCursorPaginator(), ) result = [post for page in list(page_generator) for post in page] - for i in range(49): - assert result[i] == {"id": 51 + i, "title": f"Post {51 + i}"} + for i in range(3 * DEFAULT_PAGE_SIZE): + assert result[i] == {"id": posts_skip + i, "title": f"Post {posts_skip + i}"} def test_post_json_body_without_params(self, rest_client) -> None: + # leave two pages of data + posts_skip = (DEFAULT_TOTAL_PAGES - 2) * DEFAULT_PAGE_SIZE result = rest_client.post( path="/posts/search", - json={"ids_greater_than": 50}, + json={"ids_greater_than": posts_skip - 1}, ) returned_posts = result.json()["data"] - assert len(returned_posts) == 10 # only one page is returned - for i in range(10): - assert returned_posts[i] == {"id": 51 + i, "title": f"Post {51 + i}"} + assert len(returned_posts) == DEFAULT_PAGE_SIZE # only one page is returned + for i in range(DEFAULT_PAGE_SIZE): + assert returned_posts[i] == {"id": posts_skip + i, "title": f"Post {posts_skip + i}"} From 42dea60ecde79c8043564aa180b79f1c585e1125 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Fri, 5 Jul 2024 19:56:45 +0200 Subject: [PATCH 58/61] selects all tables from info schema if number of tables > threshold (#1547) * selects all tables from info schema if number of tables to check is more than a threshold * adds tests --- dlt/destinations/impl/bigquery/bigquery.py | 11 +++---- dlt/destinations/job_client_impl.py | 35 ++++++++++++++++------ tests/conftest.py | 2 ++ tests/load/pipeline/test_pipelines.py | 35 +++++++++++++++++++--- 4 files changed, 65 insertions(+), 18 deletions(-) diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index d0052c22f0..6db9034e34 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -411,11 +411,12 @@ def _get_info_schema_columns_query( query = f""" SELECT {",".join(self._get_storage_table_query_columns())} FROM {catalog_name}.{schema_name}.INFORMATION_SCHEMA.COLUMNS -WHERE """ - - # placeholder for each table - table_placeholders = ",".join(["%s"] * len(folded_table_names)) - query += f"table_name IN ({table_placeholders}) ORDER BY table_name, ordinal_position;" +""" + if folded_table_names: + # placeholder for each table + table_placeholders = ",".join(["%s"] * len(folded_table_names)) + query += f"WHERE table_name IN ({table_placeholders}) " + query += "ORDER BY table_name, 
ordinal_position;" return query, folded_table_names diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 0a627bbdfb..1dfa24a4ca 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -6,6 +6,7 @@ from types import TracebackType from typing import ( Any, + ClassVar, List, Optional, Sequence, @@ -133,6 +134,9 @@ def state(self) -> TLoadJobState: class SqlJobClientBase(JobClientBase, WithStateSync): + INFO_TABLES_QUERY_THRESHOLD: ClassVar[int] = 1000 + """Fallback to querying all tables in the information schema if checking more than threshold""" + def __init__( self, schema: Schema, @@ -328,8 +332,17 @@ def get_storage_tables( f"One or more of tables in {table_names} after applying" f" {self.capabilities.casefold_identifier} produced a name collision." ) + # if we have more tables to lookup than a threshold, we prefer to filter them in code + if ( + len(name_lookup) > self.INFO_TABLES_QUERY_THRESHOLD + or len(",".join(folded_table_names)) > self.capabilities.max_query_length / 2 + ): + logger.info( + "Fallback to query all columns from INFORMATION_SCHEMA due to limited query length" + " or table threshold" + ) + folded_table_names = [] - # rows = self.sql_client.execute_sql(query, *db_params) query, db_params = self._get_info_schema_columns_query( catalog_name, schema_name, folded_table_names ) @@ -337,6 +350,9 @@ def get_storage_tables( prev_table: str = None storage_columns: TTableSchemaColumns = None for c in rows: + # if we are selecting all tables this is expected + if not folded_table_names and c[0] not in name_lookup: + continue # make sure that new table is known assert ( c[0] in name_lookup @@ -437,7 +453,7 @@ def _get_info_schema_columns_query( self, catalog_name: Optional[str], schema_name: str, folded_table_names: List[str] ) -> Tuple[str, List[Any]]: """Generates SQL to query INFORMATION_SCHEMA.COLUMNS for a set of tables in `folded_table_names`. Input identifiers must be already - in a form that can be passed to a query via db_params. `catalogue_name` is optional and when None, the part of query selecting it + in a form that can be passed to a query via db_params. `catalogue_name` and `folded_tableS_name` is optional and when None, the part of query selecting it is skipped. 
Returns: query and list of db_params tuple @@ -452,13 +468,14 @@ def _get_info_schema_columns_query( db_params.append(catalog_name) query += "table_catalog = %s AND " db_params.append(schema_name) - db_params = db_params + folded_table_names - # placeholder for each table - table_placeholders = ",".join(["%s"] * len(folded_table_names)) - query += ( - f"table_schema = %s AND table_name IN ({table_placeholders}) ORDER BY table_name," - " ordinal_position;" - ) + select_tables_clause = "" + # look for particular tables only when requested, otherwise return the full schema + if folded_table_names: + db_params = db_params + folded_table_names + # placeholder for each table + table_placeholders = ",".join(["%s"] * len(folded_table_names)) + select_tables_clause = f"AND table_name IN ({table_placeholders})" + query += f"table_schema = %s {select_tables_clause} ORDER BY table_name, ordinal_position;" return query, db_params diff --git a/tests/conftest.py b/tests/conftest.py index 669fd19c35..7ed546dfea 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -114,3 +114,5 @@ def _create_pipeline_instance_id(self) -> str: # disable httpx request logging (too verbose when testing qdrant) logging.getLogger("httpx").setLevel("WARNING") + + logging.getLogger("airflow.models.variable").setLevel("CRITICAL") diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index 5b3c158b6c..a35dfa7654 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -1,11 +1,11 @@ from copy import deepcopy import gzip import os -from typing import Any, Callable, Iterator, Tuple, List, cast +from typing import Any, Iterator, List, cast, Tuple, Callable import pytest +from unittest import mock import dlt - from dlt.common import json, sleep from dlt.common.pipeline import SupportsPipeline from dlt.common.destination import Destination @@ -14,13 +14,15 @@ from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.schema.schema import Schema from dlt.common.schema.typing import VERSION_TABLE_NAME +from dlt.common.schema.utils import new_table from dlt.common.typing import TDataItem from dlt.common.utils import uniq_id from dlt.destinations.exceptions import DatabaseUndefinedRelation from dlt.destinations import filesystem, redshift +from dlt.destinations.job_client_impl import SqlJobClientBase from dlt.extract.exceptions import ResourceNameMissing -from dlt.extract import DltSource +from dlt.extract.source import DltSource from dlt.pipeline.exceptions import ( CannotRestorePipelineException, PipelineConfigMissing, @@ -821,7 +823,6 @@ def test_snowflake_delete_file_after_copy(destination_config: DestinationTestCon assert_query_data(pipeline, f"SELECT value FROM {tbl_name}", ["a", None, None]) -# do not remove - it allows us to filter tests by destination @pytest.mark.parametrize( "destination_config", destinations_configs(default_sql_configs=True, all_staging_configs=True, file_format="parquet"), @@ -1057,6 +1058,32 @@ def table_3(make_data=False): assert len(cur.fetchall()) == 0 +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) +def test_query_all_info_tables_fallback(destination_config: DestinationTestConfiguration) -> None: + pipeline = destination_config.setup_pipeline( + "parquet_test_" + uniq_id(), dataset_name="parquet_test_" + uniq_id() + ) + with mock.patch.object(SqlJobClientBase, "INFO_TABLES_QUERY_THRESHOLD", 0): + info = pipeline.run([1, 2, 
3], table_name="digits_1") + assert_load_info(info) + # create empty table + client: SqlJobClientBase + # we must add it to schema + pipeline.default_schema._schema_tables["existing_table"] = new_table("existing_table") + with pipeline.destination_client() as client: # type: ignore[assignment] + sql = client._get_table_update_sql( + "existing_table", [{"name": "_id", "data_type": "bigint"}], False + ) + client.sql_client.execute_many(sql) + # remove it from schema + del pipeline.default_schema._schema_tables["existing_table"] + # store another table + info = pipeline.run([1, 2, 3], table_name="digits_2") + assert_data_table_counts(pipeline, {"digits_1": 3, "digits_2": 3}) + + # @pytest.mark.skip(reason="Finalize the test: compare some_data values to values from database") # @pytest.mark.parametrize( # "destination_config", From 184af2d16ae9b82875d859c7b7736df450a42f36 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Fri, 5 Jul 2024 20:21:34 +0200 Subject: [PATCH 59/61] bumps to alpha 0.5.1a1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6f21d17be7..fdbbf9ba37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "0.5.1a0" +version = "0.5.1a1" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] From 4ec728f973c236698ffcae9196fee2588202fb94 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Mon, 8 Jul 2024 15:20:47 +0200 Subject: [PATCH 60/61] adds missing docs / tests and configurable staging dataset name (#1555) * allows to configure staging dataset name * adds some missing docs on naming conventions * adds missing test cases * bumps dlt version to 0.5.1 * fixes linter and tests --- dlt/common/destination/capabilities.py | 7 +- dlt/common/destination/reference.py | 40 +++++-- dlt/common/schema/schema.py | 18 ++- dlt/destinations/impl/athena/athena.py | 12 +- dlt/destinations/impl/bigquery/bigquery.py | 1 + .../impl/bigquery/configuration.py | 3 + dlt/destinations/impl/bigquery/factory.py | 3 +- dlt/destinations/impl/bigquery/sql_client.py | 3 +- .../impl/clickhouse/clickhouse.py | 5 +- .../impl/clickhouse/sql_client.py | 3 +- .../impl/databricks/databricks.py | 5 +- .../impl/databricks/sql_client.py | 3 +- dlt/destinations/impl/dremio/dremio.py | 5 +- dlt/destinations/impl/dremio/sql_client.py | 3 +- dlt/destinations/impl/duckdb/duck.py | 5 +- dlt/destinations/impl/duckdb/sql_client.py | 3 +- .../impl/lancedb/lancedb_client.py | 6 +- .../impl/motherduck/motherduck.py | 5 +- .../impl/motherduck/sql_client.py | 3 +- dlt/destinations/impl/mssql/mssql.py | 7 +- dlt/destinations/impl/mssql/sql_client.py | 3 +- dlt/destinations/impl/postgres/factory.py | 1 - dlt/destinations/impl/postgres/postgres.py | 7 +- dlt/destinations/impl/postgres/sql_client.py | 3 +- dlt/destinations/impl/qdrant/qdrant_client.py | 30 ++--- dlt/destinations/impl/redshift/redshift.py | 5 +- dlt/destinations/impl/snowflake/snowflake.py | 7 +- dlt/destinations/impl/snowflake/sql_client.py | 3 +- dlt/destinations/impl/synapse/synapse.py | 5 +- .../impl/weaviate/weaviate_client.py | 3 +- dlt/destinations/job_client_impl.py | 5 +- dlt/destinations/sql_client.py | 29 +++-- dlt/destinations/sql_jobs.py | 12 +- dlt/normalize/worker.py | 2 +- .../docs/dlt-ecosystem/destinations/athena.md | 7 +- 
.../dlt-ecosystem/destinations/bigquery.md | 16 +++ .../docs/dlt-ecosystem/destinations/mssql.md | 9 ++ .../dlt-ecosystem/destinations/postgres.md | 4 + .../dlt-ecosystem/destinations/redshift.md | 11 ++ .../dlt-ecosystem/destinations/snowflake.md | 5 +- docs/website/docs/dlt-ecosystem/staging.md | 30 ++++- .../docs/general-usage/naming-convention.md | 93 +++++++++------- docs/website/docs/reference/performance.md | 6 + .../docs/running-in-production/running.md | 10 +- pyproject.toml | 2 +- tests/cases.py | 6 +- tests/common/schema/test_inference.py | 59 +++++++++- tests/common/test_destination.py | 84 ++++++++++++++ .../bigquery/test_bigquery_table_builder.py | 68 +++++++++++- tests/load/pipeline/test_pipelines.py | 2 +- .../postgres/test_postgres_table_builder.py | 16 +++ .../redshift/test_redshift_table_builder.py | 47 +++++++- .../snowflake/test_snowflake_table_builder.py | 57 +++++++++- tests/load/test_job_client.py | 103 +++++++++++++----- tests/load/test_sql_client.py | 29 ++++- tests/pipeline/test_dlt_versions.py | 9 +- tests/pipeline/test_pipeline.py | 26 +++++ 57 files changed, 777 insertions(+), 177 deletions(-) diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index 595d3e0d26..a4835a8188 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -62,7 +62,7 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): casefold_identifier: Callable[[str], str] = str """Casing function applied by destination to represent case insensitive identifiers.""" has_case_sensitive_identifiers: bool = None - """Tells if identifiers in destination are case sensitive, before case_identifier function is applied""" + """Tells if destination supports case sensitive identifiers""" decimal_precision: Tuple[int, int] = None wei_precision: Tuple[int, int] = None max_identifier_length: int = None @@ -96,6 +96,11 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): loader_parallelism_strategy: Optional[TLoaderParallelismStrategy] = None """The destination can override the parallelism strategy""" + def generates_case_sensitive_identifiers(self) -> bool: + """Tells if capabilities as currently adjusted, will generate case sensitive identifiers""" + # must have case sensitive support and folding function must preserve casing + return self.has_case_sensitive_identifiers and self.casefold_identifier is str + @staticmethod def generic_capabilities( preferred_loader_file_format: TLoaderFileFormat = None, diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 259389a5e9..dd4fbc8e13 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -184,6 +184,8 @@ class DestinationClientDwhConfiguration(DestinationClientConfiguration): """name of default schema to be used to name effective dataset to load data to""" replace_strategy: TLoaderReplaceStrategy = "truncate-and-insert" """How to handle replace disposition for this destination, can be classic or staging""" + staging_dataset_name_layout: str = "%s_staging" + """Layout for staging dataset, where %s is replaced with dataset name. placeholder is optional""" def _bind_dataset_name( self: TDestinationDwhClient, dataset_name: str, default_schema_name: str = None @@ -201,21 +203,37 @@ def normalize_dataset_name(self, schema: Schema) -> str: If default schema name is None or equals schema.name, the schema suffix is skipped. 
""" - if not schema.name: + dataset_name = self._make_dataset_name(schema.name) + return ( + dataset_name + if not dataset_name + else schema.naming.normalize_table_identifier(dataset_name) + ) + + def normalize_staging_dataset_name(self, schema: Schema) -> str: + """Builds staging dataset name out of dataset_name and staging_dataset_name_layout.""" + if "%s" in self.staging_dataset_name_layout: + # if dataset name is empty, staging dataset name is also empty + dataset_name = self._make_dataset_name(schema.name) + if not dataset_name: + return dataset_name + # fill the placeholder + dataset_name = self.staging_dataset_name_layout % dataset_name + else: + # no placeholder, then layout is a full name. so you can have a single staging dataset + dataset_name = self.staging_dataset_name_layout + + return schema.naming.normalize_table_identifier(dataset_name) + + def _make_dataset_name(self, schema_name: str) -> str: + if not schema_name: raise ValueError("schema_name is None or empty") # if default schema is None then suffix is not added - if self.default_schema_name is not None and schema.name != self.default_schema_name: - # also normalize schema name. schema name is Python identifier and here convention may be different - return schema.naming.normalize_table_identifier( - (self.dataset_name or "") + "_" + schema.name - ) + if self.default_schema_name is not None and schema_name != self.default_schema_name: + return (self.dataset_name or "") + "_" + schema_name - return ( - self.dataset_name - if not self.dataset_name - else schema.naming.normalize_table_identifier(self.dataset_name) - ) + return self.dataset_name @configspec diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 9ef638e289..39db0e42ae 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -505,19 +505,29 @@ def get_new_table_columns( self, table_name: str, existing_columns: TTableSchemaColumns, - case_sensitive: bool = True, + case_sensitive: bool, include_incomplete: bool = False, ) -> List[TColumnSchema]: """Gets new columns to be added to `existing_columns` to bring them up to date with `table_name` schema. - Columns names are compared case sensitive by default. + Columns names are compared case sensitive by default. `existing_column` names are expected to be normalized. + Typically they come from the destination schema. Columns that are in `existing_columns` and not in `table_name` columns are ignored. + Optionally includes incomplete columns (without data type)""" casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str # type: ignore[assignment] casefold_existing = { casefold_f(col_name): col for col_name, col in existing_columns.items() } + if len(existing_columns) != len(casefold_existing): + raise SchemaCorruptedException( + self.name, + f"A set of existing columns passed to get_new_table_columns table {table_name} has" + " colliding names when case insensitive comparison is used. Original names:" + f" {list(existing_columns.keys())}. 
Case-folded names:" + f" {list(casefold_existing.keys())}", + ) diff_c: List[TColumnSchema] = [] - s_t = self.get_table_columns(table_name, include_incomplete=include_incomplete) - for c in s_t.values(): + updated_columns = self.get_table_columns(table_name, include_incomplete=include_incomplete) + for c in updated_columns.values(): if casefold_f(c["name"]) not in casefold_existing: diff_c.append(c) return diff_c diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index 2b76ca782e..4225d63fe7 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -164,7 +164,7 @@ class AthenaMergeJob(SqlMergeJob): @classmethod def _new_temp_table_name(cls, name_prefix: str, sql_client: SqlClientBase[Any]) -> str: # reproducible name so we know which table to drop - with sql_client.with_staging_dataset(staging=True): + with sql_client.with_staging_dataset(): return sql_client.make_qualified_table_name(name_prefix) @classmethod @@ -224,10 +224,11 @@ class AthenaSQLClient(SqlClientBase[Connection]): def __init__( self, dataset_name: str, + staging_dataset_name: str, config: AthenaClientConfiguration, capabilities: DestinationCapabilitiesContext, ) -> None: - super().__init__(None, dataset_name, capabilities) + super().__init__(None, dataset_name, staging_dataset_name, capabilities) self._conn: Connection = None self.config = config self.credentials = config.credentials @@ -381,7 +382,12 @@ def __init__( table_needs_own_folder=True, ) - sql_client = AthenaSQLClient(config.normalize_dataset_name(schema), config, capabilities) + sql_client = AthenaSQLClient( + config.normalize_dataset_name(schema), + config.normalize_staging_dataset_name(schema), + config, + capabilities, + ) super().__init__(schema, config, sql_client) self.sql_client: AthenaSQLClient = sql_client # type: ignore self.config: AthenaClientConfiguration = config diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index 6db9034e34..0f6b8f4838 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -182,6 +182,7 @@ def __init__( ) -> None: sql_client = BigQuerySqlClient( config.normalize_dataset_name(schema), + config.normalize_staging_dataset_name(schema), config.credentials, capabilities, config.get_location(), diff --git a/dlt/destinations/impl/bigquery/configuration.py b/dlt/destinations/impl/bigquery/configuration.py index 0e2403f7d9..ef4e63ca12 100644 --- a/dlt/destinations/impl/bigquery/configuration.py +++ b/dlt/destinations/impl/bigquery/configuration.py @@ -15,6 +15,9 @@ class BigQueryClientConfiguration(DestinationClientDwhWithStagingConfiguration): credentials: GcpServiceAccountCredentials = None location: str = "US" has_case_sensitive_identifiers: bool = True + """If True then dlt expects to load data into case sensitive dataset""" + should_set_case_sensitivity_on_new_dataset: bool = False + """If True, dlt will set case sensitivity flag on created datasets that corresponds to naming convention""" http_timeout: float = 15.0 # connection timeout for http request to BigQuery api file_upload_timeout: float = 30 * 60.0 # a timeout for file upload when loading local files diff --git a/dlt/destinations/impl/bigquery/factory.py b/dlt/destinations/impl/bigquery/factory.py index db61a6042a..b3096e9312 100644 --- a/dlt/destinations/impl/bigquery/factory.py +++ b/dlt/destinations/impl/bigquery/factory.py @@ -88,5 +88,6 @@ def adjust_capabilities( naming: 
t.Optional[NamingConvention], ) -> DestinationCapabilitiesContext: # modify the caps if case sensitive identifiers are requested - caps.has_case_sensitive_identifiers = config.has_case_sensitive_identifiers + if config.should_set_case_sensitivity_on_new_dataset: + caps.has_case_sensitive_identifiers = config.has_case_sensitive_identifiers return super().adjust_capabilities(caps, config, naming) diff --git a/dlt/destinations/impl/bigquery/sql_client.py b/dlt/destinations/impl/bigquery/sql_client.py index e6aee1fc43..dfc4094e7b 100644 --- a/dlt/destinations/impl/bigquery/sql_client.py +++ b/dlt/destinations/impl/bigquery/sql_client.py @@ -78,6 +78,7 @@ class BigQuerySqlClient(SqlClientBase[bigquery.Client], DBTransaction): def __init__( self, dataset_name: str, + staging_dataset_name: str, credentials: GcpServiceAccountCredentialsWithoutDefaults, capabilities: DestinationCapabilitiesContext, location: str = "US", @@ -88,7 +89,7 @@ def __init__( self.credentials: GcpServiceAccountCredentialsWithoutDefaults = credentials self.location = location self.http_timeout = http_timeout - super().__init__(credentials.project_id, dataset_name, capabilities) + super().__init__(credentials.project_id, dataset_name, staging_dataset_name, capabilities) self._default_retry = bigquery.DEFAULT_RETRY.with_deadline(retry_deadline) self._default_query = bigquery.QueryJobConfig( diff --git a/dlt/destinations/impl/clickhouse/clickhouse.py b/dlt/destinations/impl/clickhouse/clickhouse.py index 6dd8fd47ed..d08e91758a 100644 --- a/dlt/destinations/impl/clickhouse/clickhouse.py +++ b/dlt/destinations/impl/clickhouse/clickhouse.py @@ -295,7 +295,10 @@ def __init__( capabilities: DestinationCapabilitiesContext, ) -> None: self.sql_client: ClickHouseSqlClient = ClickHouseSqlClient( - config.normalize_dataset_name(schema), config.credentials, capabilities + config.normalize_dataset_name(schema), + config.normalize_staging_dataset_name(schema), + config.credentials, + capabilities, ) super().__init__(schema, config, self.sql_client) self.config: ClickHouseClientConfiguration = config diff --git a/dlt/destinations/impl/clickhouse/sql_client.py b/dlt/destinations/impl/clickhouse/sql_client.py index 8544643017..244db578b1 100644 --- a/dlt/destinations/impl/clickhouse/sql_client.py +++ b/dlt/destinations/impl/clickhouse/sql_client.py @@ -49,10 +49,11 @@ class ClickHouseSqlClient( def __init__( self, dataset_name: str, + staging_dataset_name: str, credentials: ClickHouseCredentials, capabilities: DestinationCapabilitiesContext, ) -> None: - super().__init__(credentials.database, dataset_name, capabilities) + super().__init__(credentials.database, dataset_name, staging_dataset_name, capabilities) self._conn: clickhouse_driver.dbapi.connection = None self.credentials = credentials self.database_name = credentials.database diff --git a/dlt/destinations/impl/databricks/databricks.py b/dlt/destinations/impl/databricks/databricks.py index 62debdedb7..fbe7fa4c6b 100644 --- a/dlt/destinations/impl/databricks/databricks.py +++ b/dlt/destinations/impl/databricks/databricks.py @@ -261,7 +261,10 @@ def __init__( capabilities: DestinationCapabilitiesContext, ) -> None: sql_client = DatabricksSqlClient( - config.normalize_dataset_name(schema), config.credentials, capabilities + config.normalize_dataset_name(schema), + config.normalize_staging_dataset_name(schema), + config.credentials, + capabilities, ) super().__init__(schema, config, sql_client) self.config: DatabricksClientConfiguration = config diff --git 
a/dlt/destinations/impl/databricks/sql_client.py b/dlt/destinations/impl/databricks/sql_client.py index 2af27020ee..4c06ef1cf3 100644 --- a/dlt/destinations/impl/databricks/sql_client.py +++ b/dlt/destinations/impl/databricks/sql_client.py @@ -48,10 +48,11 @@ class DatabricksSqlClient(SqlClientBase[DatabricksSqlConnection], DBTransaction) def __init__( self, dataset_name: str, + staging_dataset_name: str, credentials: DatabricksCredentials, capabilities: DestinationCapabilitiesContext, ) -> None: - super().__init__(credentials.catalog, dataset_name, capabilities) + super().__init__(credentials.catalog, dataset_name, staging_dataset_name, capabilities) self._conn: DatabricksSqlConnection = None self.credentials = credentials diff --git a/dlt/destinations/impl/dremio/dremio.py b/dlt/destinations/impl/dremio/dremio.py index 00e51b74a6..bea18cdea5 100644 --- a/dlt/destinations/impl/dremio/dremio.py +++ b/dlt/destinations/impl/dremio/dremio.py @@ -143,7 +143,10 @@ def __init__( capabilities: DestinationCapabilitiesContext, ) -> None: sql_client = DremioSqlClient( - config.normalize_dataset_name(schema), config.credentials, capabilities + config.normalize_dataset_name(schema), + config.normalize_staging_dataset_name(schema), + config.credentials, + capabilities, ) super().__init__(schema, config, sql_client) self.config: DremioClientConfiguration = config diff --git a/dlt/destinations/impl/dremio/sql_client.py b/dlt/destinations/impl/dremio/sql_client.py index 929aa2a0d8..7dee056da7 100644 --- a/dlt/destinations/impl/dremio/sql_client.py +++ b/dlt/destinations/impl/dremio/sql_client.py @@ -37,10 +37,11 @@ class DremioSqlClient(SqlClientBase[pydremio.DremioConnection]): def __init__( self, dataset_name: str, + staging_dataset_name: str, credentials: DremioCredentials, capabilities: DestinationCapabilitiesContext, ) -> None: - super().__init__(credentials.database, dataset_name, capabilities) + super().__init__(credentials.database, dataset_name, staging_dataset_name, capabilities) self._conn: Optional[pydremio.DremioConnection] = None self.credentials = credentials diff --git a/dlt/destinations/impl/duckdb/duck.py b/dlt/destinations/impl/duckdb/duck.py index b87a2c4780..10d4fc13de 100644 --- a/dlt/destinations/impl/duckdb/duck.py +++ b/dlt/destinations/impl/duckdb/duck.py @@ -157,7 +157,10 @@ def __init__( capabilities: DestinationCapabilitiesContext, ) -> None: sql_client = DuckDbSqlClient( - config.normalize_dataset_name(schema), config.credentials, capabilities + config.normalize_dataset_name(schema), + config.normalize_staging_dataset_name(schema), + config.credentials, + capabilities, ) super().__init__(schema, config, sql_client) self.config: DuckDbClientConfiguration = config diff --git a/dlt/destinations/impl/duckdb/sql_client.py b/dlt/destinations/impl/duckdb/sql_client.py index 95762a1f26..80bbbedc9c 100644 --- a/dlt/destinations/impl/duckdb/sql_client.py +++ b/dlt/destinations/impl/duckdb/sql_client.py @@ -46,10 +46,11 @@ class DuckDbSqlClient(SqlClientBase[duckdb.DuckDBPyConnection], DBTransaction): def __init__( self, dataset_name: str, + staging_dataset_name: str, credentials: DuckDbBaseCredentials, capabilities: DestinationCapabilitiesContext, ) -> None: - super().__init__(None, dataset_name, capabilities) + super().__init__(None, dataset_name, staging_dataset_name, capabilities) self._conn: duckdb.DuckDBPyConnection = None self.credentials = credentials diff --git a/dlt/destinations/impl/lancedb/lancedb_client.py b/dlt/destinations/impl/lancedb/lancedb_client.py index 
79a5de7f77..8265e50fbf 100644 --- a/dlt/destinations/impl/lancedb/lancedb_client.py +++ b/dlt/destinations/impl/lancedb/lancedb_client.py @@ -454,7 +454,11 @@ def add_table_fields( def _execute_schema_update(self, only_tables: Iterable[str]) -> None: for table_name in only_tables or self.schema.tables: exists, existing_columns = self.get_storage_table(table_name) - new_columns = self.schema.get_new_table_columns(table_name, existing_columns) + new_columns = self.schema.get_new_table_columns( + table_name, + existing_columns, + self.capabilities.generates_case_sensitive_identifiers(), + ) embedding_fields: List[str] = get_columns_names_with_prop( self.schema.get_table(table_name), VECTORIZE_HINT ) diff --git a/dlt/destinations/impl/motherduck/motherduck.py b/dlt/destinations/impl/motherduck/motherduck.py index 3a5f172864..5a700294fe 100644 --- a/dlt/destinations/impl/motherduck/motherduck.py +++ b/dlt/destinations/impl/motherduck/motherduck.py @@ -16,7 +16,10 @@ def __init__( ) -> None: super().__init__(schema, config, capabilities) # type: ignore sql_client = MotherDuckSqlClient( - config.normalize_dataset_name(schema), config.credentials, capabilities + config.normalize_dataset_name(schema), + config.normalize_staging_dataset_name(schema), + config.credentials, + capabilities, ) self.config: MotherDuckClientConfiguration = config # type: ignore self.sql_client: MotherDuckSqlClient = sql_client diff --git a/dlt/destinations/impl/motherduck/sql_client.py b/dlt/destinations/impl/motherduck/sql_client.py index 40157406ab..5d680160f5 100644 --- a/dlt/destinations/impl/motherduck/sql_client.py +++ b/dlt/destinations/impl/motherduck/sql_client.py @@ -9,10 +9,11 @@ class MotherDuckSqlClient(DuckDbSqlClient): def __init__( self, dataset_name: str, + staging_dataset_name: str, credentials: MotherDuckCredentials, capabilities: DestinationCapabilitiesContext, ) -> None: - super().__init__(dataset_name, credentials, capabilities) + super().__init__(dataset_name, staging_dataset_name, credentials, capabilities) self.database_name = credentials.database def catalog_name(self, escape: bool = True) -> Optional[str]: diff --git a/dlt/destinations/impl/mssql/mssql.py b/dlt/destinations/impl/mssql/mssql.py index 25aab5c52a..ec4a54d6f7 100644 --- a/dlt/destinations/impl/mssql/mssql.py +++ b/dlt/destinations/impl/mssql/mssql.py @@ -95,7 +95,7 @@ def generate_sql( ) -> List[str]: sql: List[str] = [] for table in table_chain: - with sql_client.with_staging_dataset(staging=True): + with sql_client.with_staging_dataset(): staging_table_name = sql_client.make_qualified_table_name(table["name"]) table_name = sql_client.make_qualified_table_name(table["name"]) # drop destination table @@ -149,7 +149,10 @@ def __init__( capabilities: DestinationCapabilitiesContext, ) -> None: sql_client = PyOdbcMsSqlClient( - config.normalize_dataset_name(schema), config.credentials, capabilities + config.normalize_dataset_name(schema), + config.normalize_staging_dataset_name(schema), + config.credentials, + capabilities, ) super().__init__(schema, config, sql_client) self.config: MsSqlClientConfiguration = config diff --git a/dlt/destinations/impl/mssql/sql_client.py b/dlt/destinations/impl/mssql/sql_client.py index 988b461fa7..e1b51743f5 100644 --- a/dlt/destinations/impl/mssql/sql_client.py +++ b/dlt/destinations/impl/mssql/sql_client.py @@ -45,10 +45,11 @@ class PyOdbcMsSqlClient(SqlClientBase[pyodbc.Connection], DBTransaction): def __init__( self, dataset_name: str, + staging_dataset_name: str, credentials: MsSqlCredentials, 
capabilities: DestinationCapabilitiesContext, ) -> None: - super().__init__(credentials.database, dataset_name, capabilities) + super().__init__(credentials.database, dataset_name, staging_dataset_name, capabilities) self._conn: pyodbc.Connection = None self.credentials = credentials diff --git a/dlt/destinations/impl/postgres/factory.py b/dlt/destinations/impl/postgres/factory.py index b873bf97d5..0fe8c6d13e 100644 --- a/dlt/destinations/impl/postgres/factory.py +++ b/dlt/destinations/impl/postgres/factory.py @@ -32,7 +32,6 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps.casefold_identifier = str.lower caps.has_case_sensitive_identifiers = True caps.escape_literal = escape_postgres_literal - caps.has_case_sensitive_identifiers = True caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) caps.wei_precision = (2 * EVM_DECIMAL_PRECISION, EVM_DECIMAL_PRECISION) caps.max_identifier_length = 63 diff --git a/dlt/destinations/impl/postgres/postgres.py b/dlt/destinations/impl/postgres/postgres.py index 7b173a7711..f47549fc4f 100644 --- a/dlt/destinations/impl/postgres/postgres.py +++ b/dlt/destinations/impl/postgres/postgres.py @@ -95,7 +95,7 @@ def generate_sql( ) -> List[str]: sql: List[str] = [] for table in table_chain: - with sql_client.with_staging_dataset(staging=True): + with sql_client.with_staging_dataset(): staging_table_name = sql_client.make_qualified_table_name(table["name"]) table_name = sql_client.make_qualified_table_name(table["name"]) # drop destination table @@ -211,7 +211,10 @@ def __init__( capabilities: DestinationCapabilitiesContext, ) -> None: sql_client = Psycopg2SqlClient( - config.normalize_dataset_name(schema), config.credentials, capabilities + config.normalize_dataset_name(schema), + config.normalize_staging_dataset_name(schema), + config.credentials, + capabilities, ) super().__init__(schema, config, sql_client) self.config: PostgresClientConfiguration = config diff --git a/dlt/destinations/impl/postgres/sql_client.py b/dlt/destinations/impl/postgres/sql_client.py index 38bfc212d5..d867248196 100644 --- a/dlt/destinations/impl/postgres/sql_client.py +++ b/dlt/destinations/impl/postgres/sql_client.py @@ -34,10 +34,11 @@ class Psycopg2SqlClient(SqlClientBase["psycopg2.connection"], DBTransaction): def __init__( self, dataset_name: str, + staging_dataset_name: str, credentials: PostgresCredentials, capabilities: DestinationCapabilitiesContext, ) -> None: - super().__init__(credentials.database, dataset_name, capabilities) + super().__init__(credentials.database, dataset_name, staging_dataset_name, capabilities) self._conn: psycopg2.connection = None self.credentials = credentials diff --git a/dlt/destinations/impl/qdrant/qdrant_client.py b/dlt/destinations/impl/qdrant/qdrant_client.py index 80c158d51a..080c277edd 100644 --- a/dlt/destinations/impl/qdrant/qdrant_client.py +++ b/dlt/destinations/impl/qdrant/qdrant_client.py @@ -491,24 +491,26 @@ def _execute_schema_update(self, only_tables: Iterable[str]) -> None: exists = self._collection_exists(table_name) qualified_collection_name = self._make_qualified_collection_name(table_name) + # NOTE: there are no property schemas in qdrant so we do not need to alter + # existing collections if not exists: self._create_collection( full_collection_name=qualified_collection_name, ) - if not is_local: # Indexes don't work in local Qdrant (trigger log warning) - # Create indexes to enable order_by in state and schema tables - if table_name == self.schema.state_table_name: - 
self.db_client.create_payload_index( - collection_name=qualified_collection_name, - field_name=self.schema.naming.normalize_identifier("created_at"), - field_schema="datetime", - ) - elif table_name == self.schema.version_table_name: - self.db_client.create_payload_index( - collection_name=qualified_collection_name, - field_name=self.schema.naming.normalize_identifier("inserted_at"), - field_schema="datetime", - ) + if not is_local: # Indexes don't work in local Qdrant (trigger log warning) + # Create indexes to enable order_by in state and schema tables + if table_name == self.schema.state_table_name: + self.db_client.create_payload_index( + collection_name=qualified_collection_name, + field_name=self.schema.naming.normalize_identifier("created_at"), + field_schema="datetime", + ) + elif table_name == self.schema.version_table_name: + self.db_client.create_payload_index( + collection_name=qualified_collection_name, + field_name=self.schema.naming.normalize_identifier("inserted_at"), + field_schema="datetime", + ) self._update_schema_in_storage(self.schema) diff --git a/dlt/destinations/impl/redshift/redshift.py b/dlt/destinations/impl/redshift/redshift.py index faa037078a..8eacc76d11 100644 --- a/dlt/destinations/impl/redshift/redshift.py +++ b/dlt/destinations/impl/redshift/redshift.py @@ -231,7 +231,10 @@ def __init__( capabilities: DestinationCapabilitiesContext, ) -> None: sql_client = RedshiftSqlClient( - config.normalize_dataset_name(schema), config.credentials, capabilities + config.normalize_dataset_name(schema), + config.normalize_staging_dataset_name(schema), + config.credentials, + capabilities, ) super().__init__(schema, config, sql_client) self.sql_client = sql_client diff --git a/dlt/destinations/impl/snowflake/snowflake.py b/dlt/destinations/impl/snowflake/snowflake.py index b0786e9ed6..532ff404ae 100644 --- a/dlt/destinations/impl/snowflake/snowflake.py +++ b/dlt/destinations/impl/snowflake/snowflake.py @@ -110,7 +110,7 @@ def __init__( case_folding = ( "CASE_SENSITIVE" - if client.capabilities.casefold_identifier is str + if client.capabilities.generates_case_sensitive_identifiers() else "CASE_INSENSITIVE" ) column_match_clause = f"MATCH_BY_COLUMN_NAME='{case_folding}'" @@ -228,7 +228,10 @@ def __init__( capabilities: DestinationCapabilitiesContext, ) -> None: sql_client = SnowflakeSqlClient( - config.normalize_dataset_name(schema), config.credentials, capabilities + config.normalize_dataset_name(schema), + config.normalize_staging_dataset_name(schema), + config.credentials, + capabilities, ) super().__init__(schema, config, sql_client) self.config: SnowflakeClientConfiguration = config diff --git a/dlt/destinations/impl/snowflake/sql_client.py b/dlt/destinations/impl/snowflake/sql_client.py index e033a9f455..fbc80b7b6c 100644 --- a/dlt/destinations/impl/snowflake/sql_client.py +++ b/dlt/destinations/impl/snowflake/sql_client.py @@ -34,10 +34,11 @@ class SnowflakeSqlClient(SqlClientBase[snowflake_lib.SnowflakeConnection], DBTra def __init__( self, dataset_name: str, + staging_dataset_name: str, credentials: SnowflakeCredentials, capabilities: DestinationCapabilitiesContext, ) -> None: - super().__init__(credentials.database, dataset_name, capabilities) + super().__init__(credentials.database, dataset_name, staging_dataset_name, capabilities) self._conn: snowflake_lib.SnowflakeConnection = None self.credentials = credentials diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index de2f9d4472..408bfc2b53 100644 --- 
a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -62,7 +62,10 @@ def __init__( super().__init__(schema, config, capabilities) self.config: SynapseClientConfiguration = config self.sql_client = SynapseSqlClient( - config.normalize_dataset_name(schema), config.credentials, capabilities + config.normalize_dataset_name(schema), + config.normalize_staging_dataset_name(schema), + config.credentials, + capabilities, ) self.active_hints = deepcopy(HINT_TO_SYNAPSE_ATTR) diff --git a/dlt/destinations/impl/weaviate/weaviate_client.py b/dlt/destinations/impl/weaviate/weaviate_client.py index 71f2f13e76..dfbf83d7e5 100644 --- a/dlt/destinations/impl/weaviate/weaviate_client.py +++ b/dlt/destinations/impl/weaviate/weaviate_client.py @@ -467,8 +467,7 @@ def _execute_schema_update(self, only_tables: Iterable[str]) -> None: new_columns = self.schema.get_new_table_columns( table_name, existing_columns, - case_sensitive=self.capabilities.has_case_sensitive_identifiers - and self.capabilities.casefold_identifier is str, + case_sensitive=self.capabilities.generates_case_sensitive_identifiers(), ) logger.info(f"Found {len(new_columns)} updates for {table_name} in {self.schema.name}") if len(new_columns) > 0: diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 1dfa24a4ca..e00b7ebb05 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -607,8 +607,7 @@ def _create_table_update( updates = self.schema.get_new_table_columns( table_name, storage_columns, - case_sensitive=self.capabilities.has_case_sensitive_identifiers - and self.capabilities.casefold_identifier is str, + case_sensitive=self.capabilities.generates_case_sensitive_identifiers(), ) logger.info(f"Found {len(updates)} updates for {table_name} in {self.schema.name}") return updates @@ -684,7 +683,7 @@ class SqlJobClientWithStaging(SqlJobClientBase, WithStagingDataset): @contextlib.contextmanager def with_staging_dataset(self) -> Iterator["SqlJobClientBase"]: try: - with self.sql_client.with_staging_dataset(True): + with self.sql_client.with_staging_dataset(): self.in_staging_mode = True yield self finally: diff --git a/dlt/destinations/sql_client.py b/dlt/destinations/sql_client.py index f74f1b9224..fbe2b17fc2 100644 --- a/dlt/destinations/sql_client.py +++ b/dlt/destinations/sql_client.py @@ -31,12 +31,26 @@ class SqlClientBase(ABC, Generic[TNativeConn]): dbapi: ClassVar[DBApi] = None + database_name: Optional[str] + """Database or catalog name, optional""" + dataset_name: str + """Normalized dataset name""" + staging_dataset_name: str + """Normalized staging dataset name""" + capabilities: DestinationCapabilitiesContext + """Instance of adjusted destination capabilities""" + def __init__( - self, database_name: str, dataset_name: str, capabilities: DestinationCapabilitiesContext + self, + database_name: str, + dataset_name: str, + staging_dataset_name: str, + capabilities: DestinationCapabilitiesContext, ) -> None: if not dataset_name: raise ValueError(dataset_name) self.dataset_name = dataset_name + self.staging_dataset_name = staging_dataset_name self.database_name = database_name self.capabilities = capabilities @@ -193,13 +207,8 @@ def with_alternative_dataset_name( # restore previous dataset name self.dataset_name = current_dataset_name - def with_staging_dataset( - self, staging: bool = False - ) -> ContextManager["SqlClientBase[TNativeConn]"]: - dataset_name = self.dataset_name - if staging: - dataset_name = 
SqlClientBase.make_staging_dataset_name(dataset_name) - return self.with_alternative_dataset_name(dataset_name) + def with_staging_dataset(self) -> ContextManager["SqlClientBase[TNativeConn]"]: + return self.with_alternative_dataset_name(self.staging_dataset_name) def _ensure_native_conn(self) -> None: if not self.native_connection: @@ -216,10 +225,6 @@ def is_dbapi_exception(ex: Exception) -> bool: mro = type.mro(type(ex)) return any(t.__name__ in ("DatabaseError", "DataError") for t in mro) - @staticmethod - def make_staging_dataset_name(dataset_name: str) -> str: - return dataset_name + "_staging" - def _get_information_schema_components(self, *tables: str) -> Tuple[str, str, List[str]]: """Gets catalog name, schema name and name of the tables in format that can be directly used to query INFORMATION_SCHEMA. catalog name is optional: in that case None is diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index b9539fe114..1715389e17 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -95,7 +95,7 @@ def _generate_clone_sql( """Drop and clone the table for supported destinations""" sql: List[str] = [] for table in table_chain: - with sql_client.with_staging_dataset(staging=True): + with sql_client.with_staging_dataset(): staging_table_name = sql_client.make_qualified_table_name(table["name"]) table_name = sql_client.make_qualified_table_name(table["name"]) sql.append(f"DROP TABLE IF EXISTS {table_name};") @@ -112,7 +112,7 @@ def _generate_insert_sql( ) -> List[str]: sql: List[str] = [] for table in table_chain: - with sql_client.with_staging_dataset(staging=True): + with sql_client.with_staging_dataset(): staging_table_name = sql_client.make_qualified_table_name(table["name"]) table_name = sql_client.make_qualified_table_name(table["name"]) columns = ", ".join( @@ -368,7 +368,7 @@ def gen_merge_sql( # get top level table full identifiers root_table_name = sql_client.make_qualified_table_name(root_table["name"]) - with sql_client.with_staging_dataset(staging=True): + with sql_client.with_staging_dataset(): staging_root_table_name = sql_client.make_qualified_table_name(root_table["name"]) # get merge and primary keys from top level @@ -489,7 +489,7 @@ def gen_merge_sql( # insert from staging to dataset for table in table_chain: table_name = sql_client.make_qualified_table_name(table["name"]) - with sql_client.with_staging_dataset(staging=True): + with sql_client.with_staging_dataset(): staging_table_name = sql_client.make_qualified_table_name(table["name"]) insert_cond = not_deleted_cond if hard_delete_col is not None else "1 = 1" @@ -527,7 +527,7 @@ def gen_scd2_sql( sql: List[str] = [] root_table = table_chain[0] root_table_name = sql_client.make_qualified_table_name(root_table["name"]) - with sql_client.with_staging_dataset(staging=True): + with sql_client.with_staging_dataset(): staging_root_table_name = sql_client.make_qualified_table_name(root_table["name"]) # get column names @@ -599,7 +599,7 @@ def gen_scd2_sql( # - this write disposition is way more similar to regular merge (how root tables are handled is different, other tables handled same) for table in child_tables: table_name = sql_client.make_qualified_table_name(table["name"]) - with sql_client.with_staging_dataset(staging=True): + with sql_client.with_staging_dataset(): staging_table_name = sql_client.make_qualified_table_name(table["name"]) sql.append(f""" INSERT INTO {table_name} diff --git a/dlt/normalize/worker.py b/dlt/normalize/worker.py index cd50c56e09..10d0a00eb1 
100644 --- a/dlt/normalize/worker.py +++ b/dlt/normalize/worker.py @@ -165,7 +165,7 @@ def _get_items_normalizer( item_storage = load_storage.create_item_storage(best_writer_spec) if not is_native_writer(item_storage.writer_cls): logger.warning( - f"For data items yielded as {item_format} and job file format" + f"For data items in `{table_name}` yielded as {item_format} and job file format" f" {best_writer_spec.file_format} native writer could not be found. A" f" {item_storage.writer_cls.__name__} writer is used that internally" f" converts {item_format}. This will degrade performance." diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md index a723e3554c..2a8b8c6b9d 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/athena.md +++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md @@ -102,8 +102,11 @@ Athena does not support JSON fields, so JSON is stored as a string. > ❗**Athena does not support TIME columns in parquet files**. `dlt` will fail such jobs permanently. Convert `datetime.time` objects to `str` or `datetime.datetime` to load them. -### Naming Convention -We follow our snake_case name convention. Keep the following in mind: +### Table and column identifiers +Athena uses case insensitive identifiers and **will lower case all the identifiers** that are stored in the INFORMATION SCHEMA. Do not use +[case sensitive naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations). Letter casing will be removed anyway and you risk to generate identifier collisions, which are detected by `dlt` and will fail the load process. + +Under the hood Athena uses different SQL engines for DDL (catalog) and DML/Queries: * DDL uses HIVE escaping with `````` * Other queries use PRESTO and regular SQL escaping. diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md index f97a4a96bb..4d92043fb5 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md +++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md @@ -167,6 +167,22 @@ BigQuery supports the following [column hints](https://dlthub.com/docs/general-u * `cluster` - creates a cluster column(s). Many columns per table are supported and only when a new table is created. +### Table and column identifiers +BigQuery uses case sensitive identifiers by default and this is what `dlt` assumes. If the dataset you use has case insensitive identifiers (you have such option +when you create it) make sure that you use case insensitive [naming convention](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) or you tell `dlt` about it so identifier collisions are properly detected. +```toml +[destination.bigquery] +has_case_sensitive_identifiers=false +``` + +You have an option to allow `dlt` to set the case sensitivity for newly created datasets. In that case it will follow the case sensitivity of current +naming convention (ie. the default **snake_case** will create dataset with case insensitive identifiers). +```toml +[destination.bigquery] +should_set_case_sensitivity_on_new_dataset=true +``` +The option above is off by default. + ## Staging Support BigQuery supports GCS as a file staging destination. `dlt` will upload files in the parquet format to GCS and ask BigQuery to copy their data directly into the database. 
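The BigQuery identifier options documented above can also be set in code instead of `config.toml`. The snippet below is a minimal sketch only: it assumes the `bigquery` destination factory forwards these configuration fields as keyword arguments to `BigQueryClientConfiguration` (the same pattern the staging docs in this patch use for `dlt.destinations.postgres`); the pipeline and dataset names are made up.

```py
import dlt

# sketch: both flags are fields on BigQueryClientConfiguration (see the diff above),
# passed here as factory keyword arguments rather than config.toml entries
bigquery_dest = dlt.destinations.bigquery(
    has_case_sensitive_identifiers=False,  # the target dataset resolves identifiers case insensitively
    should_set_case_sensitivity_on_new_dataset=True,  # newly created datasets follow the active naming convention
)

pipeline = dlt.pipeline(
    pipeline_name="bigquery_case_demo",  # hypothetical name
    destination=bigquery_dest,
    dataset_name="github_data",  # hypothetical dataset
)
```

As noted above, `should_set_case_sensitivity_on_new_dataset` is off by default, so existing dataset settings are never modified unless you opt in.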
diff --git a/docs/website/docs/dlt-ecosystem/destinations/mssql.md b/docs/website/docs/dlt-ecosystem/destinations/mssql.md index 6aac877d7b..0512fd5fca 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/mssql.md +++ b/docs/website/docs/dlt-ecosystem/destinations/mssql.md @@ -114,6 +114,15 @@ Data is loaded via INSERT statements by default. MSSQL has a limit of 1000 rows ## Supported column hints **mssql** will create unique indexes for all columns with `unique` hints. This behavior **may be disabled**. +### Table and column identifiers +SQL Server **with the default collation** uses case insensitive identifiers but will preserve the casing of identifiers that are stored in the INFORMATION SCHEMA. You can use [case sensitive naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) to keep the identifier casing. Note that you risk to generate identifier collisions, which are detected by `dlt` and will fail the load process. + +If you change SQL Server server/database collation to case sensitive, this will also affect the identifiers. Configure your destination as below in order to use case sensitive naming conventions without collisions: +```toml +[destination.mssql] +has_case_sensitive_identifiers=true +``` + ## Syncing of `dlt` state This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). diff --git a/docs/website/docs/dlt-ecosystem/destinations/postgres.md b/docs/website/docs/dlt-ecosystem/destinations/postgres.md index 49b3c06208..1281298312 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/postgres.md +++ b/docs/website/docs/dlt-ecosystem/destinations/postgres.md @@ -98,6 +98,10 @@ In the example above `arrow_table` will be converted to csv with **pyarrow** and ## Supported column hints `postgres` will create unique indexes for all columns with `unique` hints. This behavior **may be disabled**. +### Table and column identifiers +Postgres supports both case sensitive and case insensitive identifiers. All unquoted and lowercase identifiers resolve case-insensitively in SQL statements. Case insensitive [naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) like the default **snake_case** will generate case insensitive identifiers. Case sensitive (like **sql_cs_v1**) will generate +case sensitive identifiers that must be quoted in SQL statements. + ## Additional destination options The Postgres destination creates UNIQUE indexes by default on columns with the `unique` hint (i.e., `_dlt_id`). To disable this behavior: ```toml diff --git a/docs/website/docs/dlt-ecosystem/destinations/redshift.md b/docs/website/docs/dlt-ecosystem/destinations/redshift.md index ab193c755d..bb92d651f2 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/redshift.md +++ b/docs/website/docs/dlt-ecosystem/destinations/redshift.md @@ -93,6 +93,17 @@ Amazon Redshift supports the following column hints: - `cluster` - This hint is a Redshift term for table distribution. Applying it to a column makes it the "DISTKEY," affecting query and join performance. Check the following [documentation](https://docs.aws.amazon.com/redshift/latest/dg/c_best-practices-best-dist-key.html) for more info. - `sort` - This hint creates a SORTKEY to order rows on disk physically. It is used to improve query and join speed in Redshift. Please read the [sort key docs](https://docs.aws.amazon.com/redshift/latest/dg/c_best-practices-sort-key.html) to learn more. 
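To make the `cluster` and `sort` hints described above concrete, here is a minimal sketch of a resource definition that attaches them via the `columns` argument; the table, column names and rows are invented for illustration.

```py
import dlt

@dlt.resource(
    name="orders",  # hypothetical table name
    columns={
        "customer_id": {"cluster": True},  # on Redshift this column becomes the DISTKEY
        "created_at": {"sort": True},      # on Redshift this column becomes a SORTKEY
    },
)
def orders():
    # made-up rows, only to show where the hinted columns come from
    yield {"order_id": 1, "customer_id": 42, "created_at": "2024-05-01T00:00:00Z"}
```

The same hint keys are regular dlt column schema properties, so they can equally be applied with `apply_hints` on an existing resource.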
+### Table and column identifiers +Redshift **by default** uses case insensitive identifiers and **will lower case all the identifiers** that are stored in the INFORMATION SCHEMA. Do not use +[case sensitive naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations). Letter casing will be removed anyway and you risk to generate identifier collisions, which are detected by `dlt` and will fail the load process. + +You can [put Redshift in case sensitive mode](https://docs.aws.amazon.com/redshift/latest/dg/r_enable_case_sensitive_identifier.html). Configure your destination as below in order to use case sensitive naming conventions: +```toml +[destination.redshift] +has_case_sensitive_identifiers=true +``` + + ## Staging support Redshift supports s3 as a file staging destination. dlt will upload files in the parquet format to s3 and ask Redshift to copy their data directly into the db. Please refer to the [S3 documentation](./filesystem.md#aws-s3) to learn how to set up your s3 bucket with the bucket_url and credentials. The `dlt` Redshift loader will use the AWS credentials provided for s3 to access the s3 bucket if not specified otherwise (see config options below). Alternatively to parquet files, you can also specify jsonl as the staging file format. For this, set the `loader_file_format` argument of the `run` command of the pipeline to `jsonl`. diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index b92d242c8a..33d199376c 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -171,9 +171,12 @@ Snowflake supports the following [column hints](https://dlthub.com/docs/general- * `cluster` - creates a cluster column(s). Many columns per table are supported and only when a new table is created. ### Table and column identifiers -Snowflake makes all unquoted identifiers uppercase and then resolves them case-insensitively in SQL statements. `dlt` (effectively) does not quote identifiers in DDL, preserving default behavior. +Snowflake supports both case sensitive and case insensitive identifiers. All unquoted and uppercase identifiers resolve case-insensitively in SQL statements. Case insensitive [naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) like the default **snake_case** will generate case insensitive identifiers. Case sensitive (like **sql_cs_v1**) will generate +case sensitive identifiers that must be quoted in SQL statements. +:::note Names of tables and columns in [schemas](../../general-usage/schema.md) are kept in lower case like for all other destinations. This is the pattern we observed in other tools, i.e., `dbt`. In the case of `dlt`, it is, however, trivial to define your own uppercase [naming convention](../../general-usage/schema.md#naming-convention) +::: ## Staging support diff --git a/docs/website/docs/dlt-ecosystem/staging.md b/docs/website/docs/dlt-ecosystem/staging.md index e3a60dfa51..05e31a574b 100644 --- a/docs/website/docs/dlt-ecosystem/staging.md +++ b/docs/website/docs/dlt-ecosystem/staging.md @@ -7,9 +7,37 @@ keywords: [staging, destination] The goal of staging is to bring the data closer to the database engine so the modification of the destination (final) dataset happens faster and without errors. `dlt`, when asked, creates two staging areas: -1. 
A **staging dataset** used by the [merge and replace loads](../general-usage/incremental-loading.md#merge-incremental_loading) to deduplicate and merge data with the destination. Such staging dataset has the same name as the dataset passed to `dlt.pipeline` but with `_staging` suffix in the name. As a user you typically never see and directly interact with it. +1. A **staging dataset** used by the [merge and replace loads](../general-usage/incremental-loading.md#merge-incremental_loading) to deduplicate and merge data with the destination. 2. A **staging storage** which is typically a s3/gcp bucket where [loader files](file-formats/) are copied before they are loaded by the destination. +## Staging dataset +`dlt` creates a staging dataset when write disposition of any of the loaded resources requires it. It creates and migrates required tables exactly like for the +main dataset. Data in staging tables is truncated when load step begins and only for tables that will participate in it. +Such staging dataset has the same name as the dataset passed to `dlt.pipeline` but with `_staging` suffix in the name. Alternatively, you can provide your own staging dataset pattern or use a fixed name, identical for all the +configured datasets. +```toml +[destination.postgres] +staging_dataset_name_layout="staging_%s" +``` +Entry above switches the pattern to `staging_` prefix and for example for dataset with name **github_data** `dlt` will create **staging_github_data**. + +To configure static staging dataset name, you can do the following (we use destination factory) +```py +import dlt + +dest_ = dlt.destinations.postgres(staging_dataset_name_layout="_dlt_staging") +``` +All pipelines using `dest_` as destination will use **staging_dataset** to store staging tables. Make sure that your pipelines are not overwriting each other's tables. + +### Cleanup up staging dataset automatically +`dlt` does not truncate tables in staging dataset at the end of the load. Data that is left after contains all the extracted data and may be useful for debugging. +If you prefer to truncate it, put the following line in `config.toml`: + +```toml +[load] +truncate_staging_dataset=true +``` + ## Staging storage `dlt` allows to chain destinations where the first one (`staging`) is responsible for uploading the files from local filesystem to the remote storage. It then generates followup jobs for the second destination that (typically) copy the files from remote storage into destination. diff --git a/docs/website/docs/general-usage/naming-convention.md b/docs/website/docs/general-usage/naming-convention.md index 72db7bf5f3..bf6e650b9c 100644 --- a/docs/website/docs/general-usage/naming-convention.md +++ b/docs/website/docs/general-usage/naming-convention.md @@ -5,23 +5,20 @@ keywords: [identifiers, snake case, case sensitive, case insensitive, naming] --- # Naming Convention -`dlt` creates table and column identifiers from the data. The data source that ie. a stream of JSON documents may have identifiers (i.e. key names in a dictionary) with any Unicode characters, of any length and naming style. On the other hand, destinations require that you follow strict rules when you name tables, columns or collections. +`dlt` creates table and column identifiers from the data. The data source ie. a stream of JSON documents may have identifiers (i.e. key names in a dictionary) with any Unicode characters, of any length and naming style. 
On the other hand, destinations require that you follow strict rules when you name tables, columns or collections. A good example is [Redshift](../dlt-ecosystem/destinations/redshift.md#naming-convention) that accepts case-insensitive alphanumeric identifiers with maximum 127 characters. -`dlt` groups tables from a single [source](source.md) in a [schema](schema.md). +`dlt` groups tables from a single [source](source.md) in a [schema](schema.md). Each schema defines **naming convention** that tells `dlt` how to translate identifiers to the +namespace that the destination understands. Naming conventions are in essence functions that map strings from the source identifier format into destination identifier format. For example our **snake_case** (default) naming convention will translate `DealFlow` source identifier into `deal_flow` destination identifier. -Each schema defines **naming convention** that tells `dlt` how to translate identifiers to the -namespace that the destination understands. Naming conventions are in essence functions that map strings from the source identifier format into destination identifier format. For example our **snake_case** (default) naming convention will translate `DealFlow` into `deal_flow` identifier. - -You can pick which naming convention to use. `dlt` provides a few to [choose from](#available-naming-conventions) or you can [easily add your own](#write-your-own-naming-convention). +You can pick which naming convention to use. `dlt` provides a few to [choose from](#available-naming-conventions). You can [easily add your own](#write-your-own-naming-convention) as well. :::tip -* Standard behavior of `dlt` is to **use the same naming convention for all destinations** so users see always the same tables and column names in their databases. -* Use simple, short small caps identifiers for everything so no normalization is needed +Standard behavior of `dlt` is to **use the same naming convention for all destinations** so users see always the same table and column names in their databases. ::: ### Use default naming convention (snake_case) -Case insensitive naming convention, converting source identifiers into lower case snake case with reduced alphabet. +**snake_case** is case insensitive naming convention, converting source identifiers into lower case snake case identifiers with reduced alphabet. - Spaces around identifier are trimmed - Keeps ascii alphanumerics and underscores, replaces all other characters with underscores (with the exceptions below) @@ -46,68 +43,90 @@ naming="sql_ci_v1" ## Source identifiers vs destination identifiers ### Pick the right identifier form when defining resources `dlt` keeps source (not normalized) identifiers during data [extraction](../reference/explainers/how-dlt-works.md#extract) and translates them during [normalization](../reference/explainers/how-dlt-works.md#normalize). For you it means: -1. If you write a [transformer](resource.md#process-resources-with-dlttransformer) or a [mapping/filtering function](resource.md#filter-transform-and-pivot-data), you will see the original data, without any normalization. Use the source key names to access the dicts! -2. If you define a `primary_key` or `cursor` that participate in [cursor field incremental loading](incremental-loading.md#incremental-loading-with-a-cursor-field) use the source identifiers (`dlt` uses them to inspect source data, `Incremental` class is a filtering function). +1. 
If you write a [transformer](resource.md#process-resources-with-dlttransformer) or a [mapping/filtering function](resource.md#filter-transform-and-pivot-data), you will see the original data, without any normalization. Use the source identifiers to access the dicts! +2. If you define a `primary_key` or `cursor` that participate in [cursor field incremental loading](incremental-loading.md#incremental-loading-with-a-cursor-field) use the source identifiers (`dlt` uses them to inspect source data, `Incremental` class is just a filtering function). 3. When defining any other hints ie. `columns` or `merge_key` you can pick source or destination identifiers. `dlt` normalizes all hints together with your data. 4. `Schema` object (ie. obtained from the pipeline or from `dlt` source via `discover_schema`) **always contains destination (normalized) identifiers**. -In the snippet below, we define a resource with various "illegal" unicode characters in table name and other hint and demonstrate how they get normalized in the schema object. -```py -``` - ### Understand the identifier normalization -Identifiers are translated from source to destination form in **normalize** step. Here's how `dlt` picks the right naming convention: +Identifiers are translated from source to destination form in **normalize** step. Here's how `dlt` picks the naming convention: -* Each destination may define a preferred naming convention (ie. Weaviate), otherwise **snake case** will be used -* This naming convention is used when new schemas are created. This happens when pipeline is run for a first time. -* Schemas preserve naming convention when saved. Your running pipelines will maintain existing naming conventions if not requested otherwise -* `dlt` applies final naming convention in `normalize` step. Naming convention comes from (1) explicit configuration (2) from destination capabilities. -* Naming convention will be used to put destination is case sensitive/insensitive mode and apply the right case folding function. +* The default naming convention is **snake_case**. +* Each destination may define a preferred naming convention in [destination capabilities](destination.md#pass-additional-parameters-and-change-destination-capabilities). Some destinations (ie. Weaviate) need specialized naming convention and will override the default. +* You can [configure a naming convention explicitly](#set-and-adjust-naming-convention-explicitly). Such configuration overrides the destination settings. +* This naming convention is used when new schemas are created. It happens when pipeline is run for the first time. +* Schemas preserve naming convention when saved. Your running pipelines will maintain existing naming conventions if not requested otherwise. +* `dlt` applies final naming convention in `normalize` step. Jobs (files) in load package now have destination identifiers. Pipeline schema is duplicated, locked and saved in the load package and will be used by the destination. :::caution -If you change naming convention and `dlt` detects that it changes the destination identifiers for tables/collection/files that already exist and store data, -the normalize process will fail. +If you change naming convention and `dlt` detects that a change in the destination identifiers for tables/collection/files that already exist and store data, +the normalize process will fail. This prevents an unwanted schema migration. New columns and tables will be created for identifiers that changed. 
::: ### Case sensitive and insensitive destinations -Naming conventions come in two types. -* **case sensitive** -* **case insensitive** +Naming convention declare if the destination identifiers they produce are case sensitive or insensitive. This helps `dlt` to [generate case sensitive / insensitive identifiers for the destinations that support both](destination.md#control-how-dlt-creates-table-column-and-other-identifiers). For example: if you pick case insensitive naming like **snake_case** or **sql_ci_v1**, with Snowflake, `dlt` will generate all upper-case identifiers that Snowflake sees as case insensitive. If you pick case sensitive naming like **sql_cs_v1**, `dlt` will generate quoted case-sensitive identifiers that preserve identifier capitalization. -Case sensitive naming convention will put a destination in [case sensitive mode](destination.md#control-how-dlt-creates-table-column-and-other-identifiers). Identifiers that -differ only in casing will not [collide](#avoid-identifier-collisions). Note that many destinations are exclusively case insensitive, of which some preserve casing of identifiers (ie. **duckdb**) and some will case-fold identifiers when creating tables (ie. **Redshift**, **Athena** do lower case on the names). +Note that many destinations are exclusively case insensitive, of which some preserve casing of identifiers (ie. **duckdb**) and some will case-fold identifiers when creating tables (ie. **Redshift**, **Athena** do lower case on the names). `dlt` is able to detect resulting identifier [collisions](#avoid-identifier-collisions) and stop the load process before data is mangled. -## Identifier shortening +### Identifier shortening Identifier shortening happens during normalization. `dlt` takes the maximum length of the identifier from the destination capabilities and will trim the identifiers that are too long. The default shortening behavior generates short deterministic hashes of the source identifiers and places them in the middle of the destination identifier. This (with a high probability) avoids shortened identifier collisions. +### 🚧 [WIP] Name convention changes are lossy +`dlt` does not store the source identifiers in the schema so when naming convention changes (or we increase the maximum identifier length), it is not able to generate a fully correct set of new identifiers. Instead it will re-normalize already normalized identifiers. We are currently working to store full identifier lineage - source identifiers will be stored and mapped to the destination in the schema. ## Pick your own naming convention ### Configure naming convention -tbd. +You can use `config.toml`, environment variables or any other configuration provider to set the naming convention name. Configured naming convention **overrides all other settings** +- changes the naming convention stored in the already created schema +- overrides the destination capabilities preference. +```toml +[schema] +naming="sql_ci_v1" +``` +Configuration above will request **sql_ci_v1** for all pipelines (schemas). An environment variable `SCHEMA__NAMING` set to `sql_ci_v1` has the same effect. + +You have an option to set naming convention per source: +```toml +[sources.zendesk] +config="prop" +[sources.zendesk.schema] +naming="sql_cs_v1" +[sources.zendesk.credentials] +password="pass" +``` +Snippet above demonstrates how to apply certain naming for an example `zendesk` source. + +You can use naming conventions that you created yourself or got from other users. 
In that case you should pass a full Python import path to the [module that contain the naming convention](#write-your-own-naming-convention): +```toml +[schema] +naming="tests.common.cases.normalizers.sql_upper" +``` +`dlt` will import `tests.common.cases.normalizers.sql_upper` and use `NamingConvention` class found in it as the naming convention. ### Available naming conventions +You can pick from a few built-in naming conventions. -* snake_case -* duck_case - case sensitive, allows all unicode characters like emoji 💥 -* direct - case sensitive, allows all unicode characters, does not contract underscores +* `snake_case` - the default +* `duck_case` - case sensitive, allows all unicode characters like emoji 💥 +* `direct` - case sensitive, allows all unicode characters, does not contract underscores * `sql_cs_v1` - case sensitive, generates sql-safe identifiers * `sql_ci_v1` - case insensitive, generates sql-safe lower case identifiers ### Set and adjust naming convention explicitly -tbd. +You can modify destination capabilities to ## Avoid identifier collisions -`dlt` detects various types of collisions and ignores the others. +`dlt` detects various types of identifier collisions and ignores the others. 1. `dlt` detects collisions if case sensitive naming convention is used on case insensitive destination 2. `dlt` detects collisions if change of naming convention changes the identifiers of tables already created in the destination 3. `dlt` detects collisions when naming convention is applied to column names of arrow tables `dlt` will not detect collision when normalizing source data. If you have a dictionary, keys will be merged if they collide after being normalized. -You can use a naming convention that does not generate collisions, see examples below. +You can create a custom naming convention that does not generate collisions on data, see examples below. ## Write your own naming convention @@ -124,5 +143,5 @@ We include [two examples](../examples/custom_naming) of naming conventions that :::note Note that a fully qualified name of your custom naming convention will be stored in the `Schema` and `dlt` will attempt to import it when schema is loaded from storage. -You should distribute your custom naming conventions with your pipeline code via an installable package with a defined namespace. +You should distribute your custom naming conventions with your pipeline code or via a pip package from which it can be imported. ::: diff --git a/docs/website/docs/reference/performance.md b/docs/website/docs/reference/performance.md index 7d8280d8ee..075d351553 100644 --- a/docs/website/docs/reference/performance.md +++ b/docs/website/docs/reference/performance.md @@ -223,6 +223,12 @@ resources are: `round_robin` and `fifo`. `fifo` is an option for sequential extraction. It will result in every resource being fully extracted until the resource generator is expired, or a configured limit is reached, then the next resource will be evaluated. Resources are extracted in the order that you added them to your source. +:::tip +Switch to `fifo` when debugging sources with many resources and connected transformers, for example [rest_api](../dlt-ecosystem/verified-sources/rest_api.md). +Your data will be requested in deterministic and straightforward order - given data item (ie. 
user record you got from API) will be processed by all resources +and transformers until completion before starting with new one +::: + You can change this setting in your `config.toml` as follows: diff --git a/docs/website/docs/running-in-production/running.md b/docs/website/docs/running-in-production/running.md index 9c52f58caa..377cf57f2c 100644 --- a/docs/website/docs/running-in-production/running.md +++ b/docs/website/docs/running-in-production/running.md @@ -105,13 +105,15 @@ package. In that case, for a correctly behaving pipeline, only minimum amount of behind. In `config.toml`: ```toml -load.delete_completed_jobs=true +[load] +delete_completed_jobs=true ``` -Also, by default, `dlt` leaves data in staging dataset, used during merge and replace load for deduplication. In order to clear it, put the following line in `config.toml`: +Also, by default, `dlt` leaves data in [staging dataset](../dlt-ecosystem/staging.md#staging-dataset), used during merge and replace load for deduplication. In order to clear it, put the following line in `config.toml`: ```toml -load.truncate_staging_dataset=true +[load] +truncate_staging_dataset=true ``` ## Using slack to send messages @@ -174,7 +176,7 @@ As with any other configuration, you can use environment variables instead of th - `RUNTIME__LOG_LEVEL` to set the log level - `LOG_FORMAT` to set the log format -`dlt` logs to a logger named **dlt**. `dlt` logger uses a regular python logger so you can configure the handlers +`dlt` logs to a logger named **dlt**. `dlt` logger uses a regular python logger so you can configure the handlers as per your requirement. For example, to put logs to the file: diff --git a/pyproject.toml b/pyproject.toml index fdbbf9ba37..f8c34a767e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "0.5.1a1" +version = "0.5.1" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] diff --git a/tests/cases.py b/tests/cases.py index d145ec1d94..fa346b8b49 100644 --- a/tests/cases.py +++ b/tests/cases.py @@ -173,7 +173,7 @@ def table_update_and_row( exclude_types: Sequence[TDataType] = None, exclude_columns: Sequence[str] = None -) -> Tuple[TTableSchemaColumns, StrAny]: +) -> Tuple[TTableSchemaColumns, Dict[str, Any]]: """Get a table schema and a row with all possible data types. Optionally exclude some data types from the schema and row. 
""" @@ -192,6 +192,7 @@ def table_update_and_row( def assert_all_data_types_row( db_row: Union[List[Any], TDataItems], + expected_row: Dict[str, Any] = None, parse_complex_strings: bool = False, allow_base64_binary: bool = False, timestamp_precision: int = 6, @@ -202,6 +203,7 @@ def assert_all_data_types_row( # content must equal # print(db_row) schema = schema or TABLE_UPDATE_COLUMNS_SCHEMA + expected_row = expected_row or TABLE_ROW_ALL_DATA_TYPES # Include only columns requested in schema if isinstance(db_row, dict): @@ -209,7 +211,7 @@ def assert_all_data_types_row( else: db_mapping = {col_name: db_row[i] for i, col_name in enumerate(schema)} - expected_rows = {key: value for key, value in TABLE_ROW_ALL_DATA_TYPES.items() if key in schema} + expected_rows = {key: value for key, value in expected_row.items() if key in schema} # prepare date to be compared: convert into pendulum instance, adjust microsecond precision if "col4" in expected_rows: parsed_date = pendulum.instance(db_mapping["col4"]) diff --git a/tests/common/schema/test_inference.py b/tests/common/schema/test_inference.py index e2821d5626..1540d8a74a 100644 --- a/tests/common/schema/test_inference.py +++ b/tests/common/schema/test_inference.py @@ -7,11 +7,12 @@ from dlt.common import Wei, Decimal, pendulum, json from dlt.common.json import custom_pua_decode from dlt.common.schema import Schema, utils -from dlt.common.schema.typing import TSimpleRegex +from dlt.common.schema.typing import TSimpleRegex, TTableSchemaColumns from dlt.common.schema.exceptions import ( CannotCoerceColumnException, CannotCoerceNullException, ParentTableNotFoundException, + SchemaCorruptedException, TablePropertiesConflictException, ) from tests.common.utils import load_json_case @@ -584,3 +585,59 @@ def test_update_table_adds_at_end(schema: Schema) -> None: table = schema.tables["eth"] # place new columns at the end assert list(table["columns"].keys()) == ["evm", "_dlt_load_id"] + + +def test_get_new_columns(schema: Schema) -> None: + # allow for casing in names + os.environ["SCHEMA__NAMING"] = "direct" + schema.update_normalizers() + + empty_table = utils.new_table("events") + schema.update_table(empty_table) + assert schema.get_new_table_columns("events", {}, case_sensitive=True) == [] + name_column = utils.new_column("name", "text") + id_column = utils.new_column("ID", "text") + existing_columns: TTableSchemaColumns = { + "id": id_column, + "name": name_column, + } + # no new columns + assert schema.get_new_table_columns("events", existing_columns, case_sensitive=True) == [] + # one new column + address_column = utils.new_column("address", "complex") + schema.update_table(utils.new_table("events", columns=[address_column])) + assert schema.get_new_table_columns("events", existing_columns, case_sensitive=True) == [ + address_column + ] + assert schema.get_new_table_columns("events", existing_columns, case_sensitive=False) == [ + address_column + ] + # name is already present + schema.update_table(utils.new_table("events", columns=[name_column])) + # so it is not detected + assert schema.get_new_table_columns("events", existing_columns, case_sensitive=True) == [ + address_column + ] + assert schema.get_new_table_columns("events", existing_columns, case_sensitive=False) == [ + address_column + ] + # id is added with different casing + ID_column = utils.new_column("ID", "text") + schema.update_table(utils.new_table("events", columns=[ID_column])) + # case sensitive will detect + assert schema.get_new_table_columns("events", existing_columns, 
case_sensitive=True) == [ + address_column, + ID_column, + ] + # insensitive doesn't + assert schema.get_new_table_columns("events", existing_columns, case_sensitive=False) == [ + address_column + ] + + # existing columns are case sensitive + existing_columns["ID"] = ID_column + assert schema.get_new_table_columns("events", existing_columns, case_sensitive=True) == [ + address_column + ] + with pytest.raises(SchemaCorruptedException): + schema.get_new_table_columns("events", existing_columns, case_sensitive=False) diff --git a/tests/common/test_destination.py b/tests/common/test_destination.py index 2c690d94bb..e6e2ecad2c 100644 --- a/tests/common/test_destination.py +++ b/tests/common/test_destination.py @@ -6,6 +6,7 @@ from dlt.common.destination.exceptions import InvalidDestinationReference, UnknownDestinationModule from dlt.common.schema import Schema from dlt.common.typing import is_subclass +from dlt.common.normalizers.naming import sql_ci_v1, sql_cs_v1 from tests.common.configuration.utils import environment from tests.utils import ACTIVE_DESTINATIONS @@ -156,6 +157,52 @@ def test_import_all_destinations() -> None: assert isinstance(dest.capabilities(), DestinationCapabilitiesContext) +def test_base_adjust_capabilities() -> None: + # return without modifications + caps = DestinationCapabilitiesContext.generic_capabilities() + caps_props = dict(caps) + adj_caps = Destination.adjust_capabilities(caps, None, None) + assert caps is adj_caps + assert dict(adj_caps) == caps_props + + # caps that support case sensitive idents may be put into case sensitive mode + caps = DestinationCapabilitiesContext.generic_capabilities() + assert caps.has_case_sensitive_identifiers is True + assert caps.casefold_identifier is str + # this one is already in case sensitive mode + assert caps.generates_case_sensitive_identifiers() is True + # applying cs naming has no effect + caps = Destination.adjust_capabilities(caps, None, sql_cs_v1.NamingConvention()) + assert caps.generates_case_sensitive_identifiers() is True + # same for ci naming, adjustment is only from case insensitive to sensitive + caps = Destination.adjust_capabilities(caps, None, sql_ci_v1.NamingConvention()) + assert caps.generates_case_sensitive_identifiers() is True + + # switch to case sensitive if supported by changing case folding function + caps = DestinationCapabilitiesContext.generic_capabilities() + caps.casefold_identifier = str.lower + assert caps.generates_case_sensitive_identifiers() is False + caps = Destination.adjust_capabilities(caps, None, sql_cs_v1.NamingConvention()) + assert caps.casefold_identifier is str + assert caps.generates_case_sensitive_identifiers() is True + # ci naming has no effect + caps = DestinationCapabilitiesContext.generic_capabilities() + caps.casefold_identifier = str.upper + caps = Destination.adjust_capabilities(caps, None, sql_ci_v1.NamingConvention()) + assert caps.casefold_identifier is str.upper + assert caps.generates_case_sensitive_identifiers() is False + + # this one does not support case sensitive identifiers and is casefolding + caps = DestinationCapabilitiesContext.generic_capabilities() + caps.has_case_sensitive_identifiers = False + caps.casefold_identifier = str.lower + assert caps.generates_case_sensitive_identifiers() is False + caps = Destination.adjust_capabilities(caps, None, sql_cs_v1.NamingConvention()) + # no effect + assert caps.casefold_identifier is str.lower + assert caps.generates_case_sensitive_identifiers() is False + + def test_instantiate_all_factories() -> None: from 
dlt import destinations @@ -305,6 +352,43 @@ def test_normalize_dataset_name() -> None: ) +def test_normalize_staging_dataset_name() -> None: + # default normalized staging dataset + assert ( + DestinationClientDwhConfiguration() + ._bind_dataset_name(dataset_name="Dataset", default_schema_name="default") + .normalize_staging_dataset_name(Schema("private")) + == "dataset_private_staging" + ) + # different layout + assert ( + DestinationClientDwhConfiguration(staging_dataset_name_layout="%s__STAGING") + ._bind_dataset_name(dataset_name="Dataset", default_schema_name="private") + .normalize_staging_dataset_name(Schema("private")) + == "dataset_staging" + ) + # without placeholder + assert ( + DestinationClientDwhConfiguration(staging_dataset_name_layout="static_staging") + ._bind_dataset_name(dataset_name="Dataset", default_schema_name="default") + .normalize_staging_dataset_name(Schema("private")) + == "static_staging" + ) + # empty dataset -> empty staging + assert ( + DestinationClientDwhConfiguration() + ._bind_dataset_name(dataset_name=None, default_schema_name="private") + .normalize_staging_dataset_name(Schema("private")) + is None + ) + assert ( + DestinationClientDwhConfiguration(staging_dataset_name_layout="static_staging") + ._bind_dataset_name(dataset_name=None, default_schema_name="default") + .normalize_staging_dataset_name(Schema("private")) + == "static_staging" + ) + + def test_normalize_dataset_name_none_default_schema() -> None: # if default schema is None, suffix is not added assert ( diff --git a/tests/load/bigquery/test_bigquery_table_builder.py b/tests/load/bigquery/test_bigquery_table_builder.py index 66ea4a319f..63ac645113 100644 --- a/tests/load/bigquery/test_bigquery_table_builder.py +++ b/tests/load/bigquery/test_bigquery_table_builder.py @@ -1,6 +1,8 @@ import os from copy import deepcopy from typing import Iterator, Dict, Any, List +from dlt.common.destination.exceptions import DestinationSchemaTampered +from dlt.common.schema.exceptions import SchemaIdentifierNormalizationCollision from dlt.destinations.impl.bigquery.bigquery_adapter import ( PARTITION_HINT, CLUSTER_HINT, @@ -18,7 +20,7 @@ GcpServiceAccountCredentials, ) from dlt.common.pendulum import pendulum -from dlt.common.schema import Schema +from dlt.common.schema import Schema, utils from dlt.common.utils import custom_environ from dlt.common.utils import uniq_id @@ -60,15 +62,30 @@ def test_configuration() -> None: @pytest.fixture def gcp_client(empty_schema: Schema) -> BigQueryClient: + return create_client(empty_schema) + + +@pytest.fixture +def ci_gcp_client(empty_schema: Schema) -> BigQueryClient: + empty_schema._normalizers_config["names"] = "tests.common.cases.normalizers.title_case" + empty_schema.update_normalizers() + # make the destination case insensitive + return create_client(empty_schema, has_case_sensitive_identifiers=False) + + +def create_client(schema: Schema, has_case_sensitive_identifiers: bool = True) -> BigQueryClient: # return a client without opening connection creds = GcpServiceAccountCredentials() creds.project_id = "test_project_id" # noinspection PydanticTypeChecker return bigquery().client( - empty_schema, - BigQueryClientConfiguration(credentials=creds)._bind_dataset_name( - dataset_name=f"test_{uniq_id()}" - ), + schema, + BigQueryClientConfiguration( + credentials=creds, + has_case_sensitive_identifiers=has_case_sensitive_identifiers, + # let modify destination caps + should_set_case_sensitivity_on_new_dataset=True, + 
)._bind_dataset_name(dataset_name=f"test_{uniq_id()}"), ) @@ -150,6 +167,47 @@ def test_alter_table(gcp_client: BigQueryClient) -> None: assert "ADD COLUMN `col2` FLOAT64 NOT NULL" in sql +def test_create_table_case_insensitive(ci_gcp_client: BigQueryClient) -> None: + # in case insensitive mode + assert ci_gcp_client.capabilities.has_case_sensitive_identifiers is False + # case sensitive naming convention + assert ci_gcp_client.sql_client.dataset_name.startswith("Test") + with ci_gcp_client.with_staging_dataset(): + assert ci_gcp_client.sql_client.dataset_name.endswith("staginG") + assert ci_gcp_client.sql_client.staging_dataset_name.endswith("staginG") + + ci_gcp_client.schema.update_table( + utils.new_table("event_test_table", columns=deepcopy(TABLE_UPDATE)) + ) + sql = ci_gcp_client._get_table_update_sql( + "Event_test_tablE", + list(ci_gcp_client.schema.get_table_columns("Event_test_tablE").values()), + False, + )[0] + sqlfluff.parse(sql, dialect="bigquery") + # everything capitalized + + # every line starts with "Col" + for line in sql.split("\n")[1:]: + assert line.startswith("`Col") + + # generate collision + ci_gcp_client.schema.update_table( + utils.new_table("event_TEST_table", columns=deepcopy(TABLE_UPDATE)) + ) + assert "Event_TEST_tablE" in ci_gcp_client.schema.tables + with pytest.raises(SchemaIdentifierNormalizationCollision) as coll_ex: + ci_gcp_client.update_stored_schema([]) + assert coll_ex.value.conflict_identifier_name == "Event_test_tablE" + assert coll_ex.value.table_name == "Event_TEST_tablE" + + # make it case sensitive + ci_gcp_client.capabilities.has_case_sensitive_identifiers = True + # now the check passes, we are stopped because it is not allowed to change schema in the loader + with pytest.raises(DestinationSchemaTampered): + ci_gcp_client.update_stored_schema([]) + + def test_create_table_with_partition_and_cluster(gcp_client: BigQueryClient) -> None: mod_update = deepcopy(TABLE_UPDATE) # timestamp diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index a35dfa7654..ffee515b90 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -1052,7 +1052,7 @@ def table_3(make_data=False): if job_client.should_load_data_to_staging_dataset( job_client.schema.tables[table_name] ): - with client.with_staging_dataset(staging=True): + with client.with_staging_dataset(): tab_name = client.make_qualified_table_name(table_name) with client.execute_query(f"SELECT * FROM {tab_name}") as cur: assert len(cur.fetchall()) == 0 diff --git a/tests/load/postgres/test_postgres_table_builder.py b/tests/load/postgres/test_postgres_table_builder.py index 5ba68be67c..86bd67db9a 100644 --- a/tests/load/postgres/test_postgres_table_builder.py +++ b/tests/load/postgres/test_postgres_table_builder.py @@ -46,6 +46,14 @@ def create_client(empty_schema: Schema) -> PostgresClient: def test_create_table(client: PostgresClient) -> None: + # make sure we are in case insensitive mode + assert client.capabilities.generates_case_sensitive_identifiers() is False + # check if dataset name is properly folded + assert client.sql_client.dataset_name == client.config.dataset_name # identical to config + assert ( + client.sql_client.staging_dataset_name + == client.config.staging_dataset_name_layout % client.config.dataset_name + ) # non existing table sql = client._get_table_update_sql("event_test_table", TABLE_UPDATE, False)[0] sqlfluff.parse(sql, dialect="postgres") @@ -143,6 +151,14 @@ def 
test_create_table_with_hints(client: PostgresClient, empty_schema: Schema) - def test_create_table_case_sensitive(cs_client: PostgresClient) -> None: + # did we switch to case sensitive + assert cs_client.capabilities.generates_case_sensitive_identifiers() is True + # check dataset names + assert cs_client.sql_client.dataset_name.startswith("Test") + with cs_client.with_staging_dataset(): + assert cs_client.sql_client.dataset_name.endswith("staginG") + assert cs_client.sql_client.staging_dataset_name.endswith("staginG") + # check tables cs_client.schema.update_table( utils.new_table("event_test_table", columns=deepcopy(TABLE_UPDATE)) ) diff --git a/tests/load/redshift/test_redshift_table_builder.py b/tests/load/redshift/test_redshift_table_builder.py index de6f450134..37ca20232d 100644 --- a/tests/load/redshift/test_redshift_table_builder.py +++ b/tests/load/redshift/test_redshift_table_builder.py @@ -3,7 +3,7 @@ from copy import deepcopy from dlt.common.utils import uniq_id, custom_environ, digest128 -from dlt.common.schema import Schema +from dlt.common.schema import Schema, utils from dlt.common.configuration import resolve_configuration from dlt.destinations import redshift @@ -21,12 +21,25 @@ @pytest.fixture def client(empty_schema: Schema) -> RedshiftClient: + return create_client(empty_schema) + + +@pytest.fixture +def cs_client(empty_schema: Schema) -> RedshiftClient: + empty_schema._normalizers_config["names"] = "tests.common.cases.normalizers.title_case" + empty_schema.update_normalizers() + # make the destination case sensitive + return create_client(empty_schema, has_case_sensitive_identifiers=True) + + +def create_client(schema: Schema, has_case_sensitive_identifiers: bool = False) -> RedshiftClient: # return client without opening connection return redshift().client( - empty_schema, - RedshiftClientConfiguration(credentials=RedshiftCredentials())._bind_dataset_name( - dataset_name="test_" + uniq_id() - ), + schema, + RedshiftClientConfiguration( + credentials=RedshiftCredentials(), + has_case_sensitive_identifiers=has_case_sensitive_identifiers, + )._bind_dataset_name(dataset_name="test_" + uniq_id()), ) @@ -55,6 +68,7 @@ def test_redshift_configuration() -> None: def test_create_table(client: RedshiftClient) -> None: + assert client.capabilities.generates_case_sensitive_identifiers() is False # non existing table sql = ";".join(client._get_table_update_sql("event_test_table", TABLE_UPDATE, False)) sqlfluff.parse(sql, dialect="redshift") @@ -105,6 +119,29 @@ def test_alter_table(client: RedshiftClient) -> None: assert '"col11_precision" time without time zone NOT NULL' in sql +def test_create_table_case_sensitive(cs_client: RedshiftClient) -> None: + # did we switch to case sensitive + assert cs_client.capabilities.generates_case_sensitive_identifiers() is True + # check dataset names + assert cs_client.sql_client.dataset_name.startswith("Test") + + # check tables + cs_client.schema.update_table( + utils.new_table("event_test_table", columns=deepcopy(TABLE_UPDATE)) + ) + sql = cs_client._get_table_update_sql( + "Event_test_tablE", + list(cs_client.schema.get_table_columns("Event_test_tablE").values()), + False, + )[0] + sqlfluff.parse(sql, dialect="redshift") + # everything capitalized + assert cs_client.sql_client.fully_qualified_dataset_name(escape=False)[0] == "T" # Test + # every line starts with "Col" + for line in sql.split("\n")[1:]: + assert line.startswith('"Col') + + def test_create_table_with_hints(client: RedshiftClient) -> None: mod_update = 
deepcopy(TABLE_UPDATE) # timestamp diff --git a/tests/load/snowflake/test_snowflake_table_builder.py b/tests/load/snowflake/test_snowflake_table_builder.py index 4bb69085da..1fc0034f43 100644 --- a/tests/load/snowflake/test_snowflake_table_builder.py +++ b/tests/load/snowflake/test_snowflake_table_builder.py @@ -4,7 +4,7 @@ import sqlfluff from dlt.common.utils import uniq_id -from dlt.common.schema import Schema +from dlt.common.schema import Schema, utils from dlt.destinations import snowflake from dlt.destinations.impl.snowflake.snowflake import SnowflakeClient from dlt.destinations.impl.snowflake.configuration import ( @@ -18,12 +18,24 @@ pytestmark = pytest.mark.essential +@pytest.fixture +def cs_client(empty_schema: Schema) -> SnowflakeClient: + # change normalizer to case sensitive + empty_schema._normalizers_config["names"] = "tests.common.cases.normalizers.title_case" + empty_schema.update_normalizers() + return create_client(empty_schema) + + @pytest.fixture def snowflake_client(empty_schema: Schema) -> SnowflakeClient: + return create_client(empty_schema) + + +def create_client(schema: Schema) -> SnowflakeClient: # return client without opening connection creds = SnowflakeCredentials() return snowflake().client( - empty_schema, + schema, SnowflakeClientConfiguration(credentials=creds)._bind_dataset_name( dataset_name="test_" + uniq_id() ), @@ -31,6 +43,22 @@ def snowflake_client(empty_schema: Schema) -> SnowflakeClient: def test_create_table(snowflake_client: SnowflakeClient) -> None: + # make sure we are in case insensitive mode + assert snowflake_client.capabilities.generates_case_sensitive_identifiers() is False + # check if dataset name is properly folded + assert ( + snowflake_client.sql_client.fully_qualified_dataset_name(escape=False) + == snowflake_client.config.dataset_name.upper() + ) + with snowflake_client.sql_client.with_staging_dataset(): + assert ( + snowflake_client.sql_client.fully_qualified_dataset_name(escape=False) + == ( + snowflake_client.config.staging_dataset_name_layout + % snowflake_client.config.dataset_name + ).upper() + ) + statements = snowflake_client._get_table_update_sql("event_test_table", TABLE_UPDATE, False) assert len(statements) == 1 sql = statements[0] @@ -81,6 +109,31 @@ def test_alter_table(snowflake_client: SnowflakeClient) -> None: assert '"COL2" FLOAT NOT NULL' in sql +def test_create_table_case_sensitive(cs_client: SnowflakeClient) -> None: + # did we switch to case sensitive + assert cs_client.capabilities.generates_case_sensitive_identifiers() is True + # check dataset names + assert cs_client.sql_client.dataset_name.startswith("Test") + with cs_client.with_staging_dataset(): + assert cs_client.sql_client.dataset_name.endswith("staginG") + assert cs_client.sql_client.staging_dataset_name.endswith("staginG") + # check tables + cs_client.schema.update_table( + utils.new_table("event_test_table", columns=deepcopy(TABLE_UPDATE)) + ) + sql = cs_client._get_table_update_sql( + "Event_test_tablE", + list(cs_client.schema.get_table_columns("Event_test_tablE").values()), + False, + )[0] + sqlfluff.parse(sql, dialect="snowflake") + # everything capitalized + assert cs_client.sql_client.fully_qualified_dataset_name(escape=False)[0] == "T" # Test + # every line starts with "Col" + for line in sql.split("\n")[1:]: + assert line.startswith('"Col') + + def test_create_table_with_partition_and_cluster(snowflake_client: SnowflakeClient) -> None: mod_update = deepcopy(TABLE_UPDATE) # timestamp diff --git a/tests/load/test_job_client.py 
b/tests/load/test_job_client.py index 69f6bd4cc4..614eb17da1 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -8,6 +8,7 @@ from typing import Iterator, Tuple, List, Dict, Any from dlt.common import json, pendulum +from dlt.common.normalizers.naming import NamingConvention from dlt.common.schema import Schema from dlt.common.schema.typing import ( LOADS_TABLE_NAME, @@ -48,6 +49,11 @@ # mark all tests as essential, do not remove pytestmark = pytest.mark.essential +TEST_NAMING_CONVENTIONS = ( + "snake_case", + "tests.common.cases.normalizers.sql_upper", + "tests.common.cases.normalizers.title_case", +) @pytest.fixture @@ -56,10 +62,20 @@ def file_storage() -> FileStorage: @pytest.fixture(scope="function") -def client(request) -> Iterator[SqlJobClientBase]: +def client(request, naming) -> Iterator[SqlJobClientBase]: yield from yield_client_with_storage(request.param.destination) +@pytest.fixture(scope="function") +def naming(request) -> str: + # NOTE: this fixture is forced by `client` fixture which requires it goes first + # so sometimes there's no request available + if hasattr(request, "param"): + os.environ["SCHEMA__NAMING"] = request.param + return request.param + return None + + @pytest.mark.order(1) @pytest.mark.parametrize( "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name @@ -69,13 +85,15 @@ def test_initialize_storage(client: SqlJobClientBase) -> None: @pytest.mark.order(2) +@pytest.mark.parametrize("naming", TEST_NAMING_CONVENTIONS, indirect=True) @pytest.mark.parametrize( "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name ) -def test_get_schema_on_empty_storage(client: SqlJobClientBase) -> None: +def test_get_schema_on_empty_storage(naming: str, client: SqlJobClientBase) -> None: # test getting schema on empty dataset without any tables - table_name, table_columns = list(client.get_storage_tables([VERSION_TABLE_NAME]))[0] - assert table_name == VERSION_TABLE_NAME + version_table_name = client.schema.version_table_name + table_name, table_columns = list(client.get_storage_tables([version_table_name]))[0] + assert table_name == version_table_name assert len(table_columns) == 0 schema_info = client.get_stored_schema() assert schema_info is None @@ -167,14 +185,17 @@ def test_get_update_basic_schema(client: SqlJobClientBase) -> None: assert this_schema == newest_schema +@pytest.mark.parametrize("naming", TEST_NAMING_CONVENTIONS, indirect=True) @pytest.mark.parametrize( "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name ) -def test_complete_load(client: SqlJobClientBase) -> None: +def test_complete_load(naming: str, client: SqlJobClientBase) -> None: + loads_table_name = client.schema.loads_table_name + version_table_name = client.schema.version_table_name client.update_stored_schema() load_id = "182879721.182912" client.complete_load(load_id) - load_table = client.sql_client.make_qualified_table_name(LOADS_TABLE_NAME) + load_table = client.sql_client.make_qualified_table_name(loads_table_name) load_rows = list(client.sql_client.execute_sql(f"SELECT * FROM {load_table}")) assert len(load_rows) == 1 assert load_rows[0][0] == load_id @@ -185,10 +206,13 @@ def test_complete_load(client: SqlJobClientBase) -> None: assert type(load_rows[0][3]) is datetime.datetime assert load_rows[0][4] == client.schema.version_hash # make sure that hash in loads exists in schema versions table - versions_table = 
client.sql_client.make_qualified_table_name(VERSION_TABLE_NAME) + versions_table = client.sql_client.make_qualified_table_name(version_table_name) + version_hash_column = client.sql_client.escape_column_name( + client.schema.naming.normalize_identifier("version_hash") + ) version_rows = list( client.sql_client.execute_sql( - f"SELECT * FROM {versions_table} WHERE version_hash = %s", load_rows[0][4] + f"SELECT * FROM {versions_table} WHERE {version_hash_column} = %s", load_rows[0][4] ) ) assert len(version_rows) == 1 @@ -453,10 +477,11 @@ def _assert_columns_order(sql_: str) -> None: _assert_columns_order(sql) +@pytest.mark.parametrize("naming", TEST_NAMING_CONVENTIONS, indirect=True) @pytest.mark.parametrize( "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name ) -def test_data_writer_load(client: SqlJobClientBase, file_storage: FileStorage) -> None: +def test_data_writer_load(naming: str, client: SqlJobClientBase, file_storage: FileStorage) -> None: if not client.capabilities.preferred_loader_file_format: pytest.skip("preferred loader file format not set, destination will only work with staging") rows, table_name = prepare_schema(client, "simple_row") @@ -474,8 +499,10 @@ def test_data_writer_load(client: SqlJobClientBase, file_storage: FileStorage) - write_dataset(client, f, [rows[1]], client.schema.get_table(table_name)["columns"]) query = f.getvalue().decode() expect_load_file(client, file_storage, query, table_name) + f_int_name = client.schema.naming.normalize_identifier("f_int") + f_int_name_quoted = client.sql_client.escape_column_name(f_int_name) db_row = client.sql_client.execute_sql( - f"SELECT * FROM {canonical_name} WHERE f_int = {rows[1]['f_int']}" + f"SELECT * FROM {canonical_name} WHERE {f_int_name_quoted} = {rows[1][f_int_name]}" )[0] assert db_row[3] is None assert db_row[5] is None @@ -521,53 +548,68 @@ def test_data_writer_string_escape_edge( assert row_value == expected +@pytest.mark.parametrize("naming", TEST_NAMING_CONVENTIONS, indirect=True) @pytest.mark.parametrize("write_disposition", ["append", "replace"]) @pytest.mark.parametrize( "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name ) def test_load_with_all_types( - client: SqlJobClientBase, write_disposition: TWriteDisposition, file_storage: FileStorage + naming: str, + client: SqlJobClientBase, + write_disposition: TWriteDisposition, + file_storage: FileStorage, ) -> None: if not client.capabilities.preferred_loader_file_format: pytest.skip("preferred loader file format not set, destination will only work with staging") table_name = "event_test_table" + uniq_id() - column_schemas, data_types = table_update_and_row( + column_schemas, data_row = table_update_and_row( exclude_types=( ["time"] if client.config.destination_type in ["databricks", "clickhouse"] else None ), ) # we should have identical content with all disposition types - client.schema.update_table( + partial = client.schema.update_table( new_table( table_name, write_disposition=write_disposition, columns=list(column_schemas.values()) ) ) + # get normalized schema + table_name = partial["name"] + column_schemas = partial["columns"] + normalize_rows([data_row], client.schema.naming) client.schema._bump_version() client.update_stored_schema() - if client.should_load_data_to_staging_dataset(client.schema.tables[table_name]): # type: ignore[attr-defined] + should_load_to_staging = client.should_load_data_to_staging_dataset(client.schema.tables[table_name]) # type: 
ignore[attr-defined] + if should_load_to_staging: with client.with_staging_dataset(): # type: ignore[attr-defined] # create staging for merge dataset client.initialize_storage() client.update_stored_schema() - with client.sql_client.with_staging_dataset( - client.should_load_data_to_staging_dataset(client.schema.tables[table_name]) # type: ignore[attr-defined] + with client.sql_client.with_alternative_dataset_name( + client.sql_client.staging_dataset_name + if should_load_to_staging + else client.sql_client.dataset_name ): canonical_name = client.sql_client.make_qualified_table_name(table_name) # write row with io.BytesIO() as f: - write_dataset(client, f, [data_types], column_schemas) + write_dataset(client, f, [data_row], column_schemas) query = f.getvalue().decode() expect_load_file(client, file_storage, query, table_name) db_row = list(client.sql_client.execute_sql(f"SELECT * FROM {canonical_name}")[0]) - # content must equal - assert_all_data_types_row( - db_row, - schema=column_schemas, - allow_base64_binary=client.config.destination_type in ["clickhouse"], - ) + assert len(db_row) == len(data_row) + # assert_all_data_types_row has many hardcoded columns so for now skip that part + if naming == "snake_case": + # content must equal + assert_all_data_types_row( + db_row, + data_row, + schema=column_schemas, + allow_base64_binary=client.config.destination_type in ["clickhouse"], + ) @pytest.mark.parametrize( @@ -650,7 +692,7 @@ def test_write_dispositions( # merge on client level, without loader, loads to staging dataset. so this table is empty assert len(db_rows) == 0 # check staging - with client.sql_client.with_staging_dataset(staging=True): + with client.sql_client.with_staging_dataset(): db_rows = list( client.sql_client.execute_sql( f"SELECT * FROM {client.sql_client.make_qualified_table_name(t)} ORDER" @@ -869,10 +911,19 @@ def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None: def prepare_schema(client: SqlJobClientBase, case: str) -> Tuple[List[Dict[str, Any]], str]: client.update_stored_schema() rows = load_json_case(case) + # normalize rows + normalize_rows(rows, client.schema.naming) # use first row to infer table table: TTableSchemaColumns = {k: client.schema._infer_column(k, v) for k, v in rows[0].items()} table_name = f"event_{case}_{uniq_id()}" - client.schema.update_table(new_table(table_name, columns=list(table.values()))) + partial = client.schema.update_table(new_table(table_name, columns=list(table.values()))) client.schema._bump_version() client.update_stored_schema() - return rows, table_name + # return normalized name + return rows, partial["name"] + + +def normalize_rows(rows: List[Dict[str, Any]], naming: NamingConvention) -> None: + for row in rows: + for k in list(row.keys()): + row[naming.normalize_identifier(k)] = row.pop(k) diff --git a/tests/load/test_sql_client.py b/tests/load/test_sql_client.py index 8d4e146034..e167f0ceda 100644 --- a/tests/load/test_sql_client.py +++ b/tests/load/test_sql_client.py @@ -1,3 +1,4 @@ +import os import pytest import datetime # noqa: I251 from typing import Iterator, Any @@ -31,6 +32,11 @@ # mark all tests as essential, do not remove pytestmark = pytest.mark.essential +TEST_NAMING_CONVENTIONS = ( + "snake_case", + "tests.common.cases.normalizers.sql_upper", + "tests.common.cases.normalizers.title_case", +) @pytest.fixture @@ -39,10 +45,20 @@ def file_storage() -> FileStorage: @pytest.fixture(scope="function") -def client(request) -> Iterator[SqlJobClientBase]: +def client(request, naming) -> 
Iterator[SqlJobClientBase]: yield from yield_client_with_storage(request.param.destination) +@pytest.fixture(scope="function") +def naming(request) -> str: + # NOTE: this fixture is forced by `client` fixture which requires it goes first + # so sometimes there's no request available + if hasattr(request, "param"): + os.environ["SCHEMA__NAMING"] = request.param + return request.param + return None + + @pytest.mark.parametrize( "client", destinations_configs( @@ -112,20 +128,22 @@ def test_malformed_query_parameters(client: SqlJobClientBase) -> None: assert client.sql_client.is_dbapi_exception(term_ex.value.dbapi_exception) +@pytest.mark.parametrize("naming", TEST_NAMING_CONVENTIONS, indirect=True) @pytest.mark.parametrize( "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name ) -def test_has_dataset(client: SqlJobClientBase) -> None: +def test_has_dataset(naming: str, client: SqlJobClientBase) -> None: with client.sql_client.with_alternative_dataset_name("not_existing"): assert not client.sql_client.has_dataset() client.update_stored_schema() assert client.sql_client.has_dataset() +@pytest.mark.parametrize("naming", TEST_NAMING_CONVENTIONS, indirect=True) @pytest.mark.parametrize( "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name ) -def test_create_drop_dataset(client: SqlJobClientBase) -> None: +def test_create_drop_dataset(naming: str, client: SqlJobClientBase) -> None: # client.sql_client.create_dataset() with pytest.raises(DatabaseException): client.sql_client.create_dataset() @@ -514,7 +532,10 @@ def test_transaction_isolation(client: SqlJobClientBase) -> None: def test_thread(thread_id: Decimal) -> None: # make a copy of the sql_client thread_client = client.sql_client.__class__( - client.sql_client.dataset_name, client.sql_client.credentials, client.capabilities + client.sql_client.dataset_name, + client.sql_client.staging_dataset_name, + client.sql_client.credentials, + client.capabilities, ) with thread_client: with thread_client.begin_transaction(): diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index 979bdd0e37..319055184a 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ -122,7 +122,10 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: sections=("destination", "duckdb"), ) with DuckDbSqlClient( - GITHUB_DATASET, duckdb_cfg.credentials, duckdb().capabilities() + GITHUB_DATASET, + "%s_staging", + duckdb_cfg.credentials, + duckdb().capabilities(), ) as client: rows = client.execute_sql(f"SELECT * FROM {LOADS_TABLE_NAME}") # make sure we have just 4 columns @@ -175,7 +178,7 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: assert "_version_hash" in state_dict with DuckDbSqlClient( - GITHUB_DATASET, duckdb_cfg.credentials, duckdb().capabilities() + GITHUB_DATASET, "%s_staging", duckdb_cfg.credentials, duckdb().capabilities() ) as client: rows = client.execute_sql( f"SELECT * FROM {LOADS_TABLE_NAME} ORDER BY inserted_at" @@ -316,7 +319,7 @@ def test_load_package_with_dlt_update(test_storage: FileStorage) -> None: sections=("destination", "duckdb"), ) with DuckDbSqlClient( - GITHUB_DATASET, duckdb_cfg.credentials, duckdb().capabilities() + GITHUB_DATASET, "%s_staging", duckdb_cfg.credentials, duckdb().capabilities() ) as client: rows = client.execute_sql("SELECT * FROM issues") assert len(rows) == 70 diff --git a/tests/pipeline/test_pipeline.py 
b/tests/pipeline/test_pipeline.py index 328119970a..a267d3106d 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -2442,6 +2442,32 @@ def test_import_unknown_file_format() -> None: assert isinstance(inner_ex.__cause__, ValueError) +def test_static_staging_dataset() -> None: + # share database and staging dataset + duckdb_ = dlt.destinations.duckdb( + "_storage/test_static_staging_dataset.db", staging_dataset_name_layout="_dlt_staging" + ) + + pipeline_1 = dlt.pipeline("test_static_staging_dataset_1", destination=duckdb_, dev_mode=True) + pipeline_2 = dlt.pipeline("test_static_staging_dataset_2", destination=duckdb_, dev_mode=True) + # staging append (without primary key) + info = pipeline_1.run([1, 2, 3], table_name="digits", write_disposition="merge") + assert_load_info(info) + info = pipeline_2.run(["a", "b", "c", "d"], table_name="letters", write_disposition="merge") + assert_load_info(info) + with pipeline_1.sql_client() as client: + with client.with_alternative_dataset_name("_dlt_staging"): + assert client.has_dataset() + schemas = client.execute_sql("SELECT schema_name FROM _dlt_staging._dlt_version") + assert {s[0] for s in schemas} == { + "test_static_staging_dataset_1", + "test_static_staging_dataset_2", + } + + assert_data_table_counts(pipeline_1, {"digits": 3}) + assert_data_table_counts(pipeline_2, {"letters": 4}) + + def assert_imported_file( pipeline: Pipeline, table_name: str, From 20f6b048d28ed0d7bdda3ec6e330bcbf093a9ea4 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Mon, 8 Jul 2024 15:41:14 +0200 Subject: [PATCH 61/61] adds examples and step by step explanation for refresh modes (#1560) --- docs/website/docs/general-usage/pipeline.md | 77 ++++++++++++++++----- 1 file changed, 58 insertions(+), 19 deletions(-) diff --git a/docs/website/docs/general-usage/pipeline.md b/docs/website/docs/general-usage/pipeline.md index d1f82f970a..f21d6f0686 100644 --- a/docs/website/docs/general-usage/pipeline.md +++ b/docs/website/docs/general-usage/pipeline.md @@ -98,33 +98,72 @@ You can reset parts or all of your sources by using the `refresh` argument to `d That means when you run the pipeline the sources/resources being processed will have their state reset and their tables either dropped or truncated depending on which refresh mode is used. +`refresh` option works with all relational/sql destinations and file buckets (`filesystem`). it does not work with vector databases (we are working on that) and +with custom destinations. + The `refresh` argument should have one of the following string values to decide the refresh mode: -* `drop_sources` - All sources being processed in `pipeline.run` or `pipeline.extract` are refreshed. - That means all tables listed in their schemas are dropped and state belonging to those sources and all their resources is completely wiped. - The tables are deleted both from pipeline's schema and from the destination database. +### Drop tables and pipeline state for a source with `drop_sources` +All sources being processed in `pipeline.run` or `pipeline.extract` are refreshed. +That means all tables listed in their schemas are dropped and state belonging to those sources and all their resources is completely wiped. +The tables are deleted both from pipeline's schema and from the destination database. 
- If you only have one source or run with all your sources together, then this is practically like running the pipeline again for the first time
+If you only have one source or run with all your sources together, then this is practically like running the pipeline again for the first time.

:::caution
This erases schema history for the selected sources and only the latest version is stored.
:::

```py
import dlt

pipeline = dlt.pipeline("airtable_demo", destination="duckdb")
pipeline.run(airtable_emojis(), refresh="drop_sources")
```
In the example above we instruct `dlt` to wipe the pipeline state belonging to the `airtable_emojis` source and drop all the database tables in `duckdb` to which data was loaded. The `airtable_emojis` source had two resources named "📆 Schedule" and "💰 Budget" loading to tables "_schedule" and "_budget". Here's what `dlt` does step by step:
1. collects a list of tables to drop by looking for all the tables in the schema that are created in the destination.
2. removes the existing pipeline state associated with the `airtable_emojis` source
3. resets the schema associated with the `airtable_emojis` source
4. executes the `extract` and `normalize` steps. Those will create a fresh pipeline state and schema
5. before it executes the `load` step, the collected tables are dropped from the staging and regular datasets
6. the `airtable_emojis` schema (associated with the source) is removed from the `_dlt_version` table
7. executes the `load` step as usual so tables are re-created and the fresh schema and pipeline state are stored.

### Selectively drop tables and resource state with `drop_resources`
Limits the refresh to the resources being processed in `pipeline.run` or `pipeline.extract` (e.g. by using `source.with_resources(...)`).
Tables belonging to those resources are dropped and their resource state is wiped (that includes incremental state).
The tables are deleted both from pipeline's schema and from the destination database.

Source-level state keys are not deleted in this mode (i.e. `dlt.state()['my_key'] = ''`)

:::caution
This erases schema history for all affected sources and only the latest schema version is stored.
:::

```py
import dlt

pipeline = dlt.pipeline("airtable_demo", destination="duckdb")
pipeline.run(airtable_emojis().with_resources("📆 Schedule"), refresh="drop_resources")
```
Above, we request that the state associated with the "📆 Schedule" resource is reset and the table generated by it ("_schedule") is dropped. Other resources, tables and state are not affected. Please check `drop_sources` for a step-by-step description of what `dlt` does internally.
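Not part of the patch, but as a quick illustration of what `drop_resources` resets versus what it leaves behind: the sketch below re-runs the `airtable_emojis` example and then peeks at the destination through `pipeline.sql_client()`, the same client the tests in this patch use. The `airtable_emojis()` demo source and the "_schedule"/"_budget" table names are taken from the documentation example above; adapt them to your own source.

```py
# rough sketch, assuming the airtable_emojis() demo source from the example above
import dlt

pipeline = dlt.pipeline("airtable_demo", destination="duckdb")
# only "📆 Schedule" is refreshed; "💰 Budget" keeps its table and incremental state
pipeline.run(airtable_emojis().with_resources("📆 Schedule"), refresh="drop_resources")

with pipeline.sql_client() as client:
    schedule = client.make_qualified_table_name("_schedule")
    budget = client.make_qualified_table_name("_budget")
    # "_schedule" was dropped and re-created, so it only holds freshly loaded rows;
    # "_budget" still holds whatever earlier runs loaded (if any)
    print(client.execute_sql(f"SELECT count(*) FROM {schedule}"))
    print(client.execute_sql(f"SELECT count(*) FROM {budget}"))
```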
Please check `drop_sources` for step by step description of what `dlt` does internally. + +### Selectively truncate tables and reset resource state with `drop_data` +Same as `drop_resources` but instead of dropping tables from schema only the data is deleted from them (i.e. by `TRUNCATE ` in sql destinations). Resource state for selected resources is also wiped. In case of [incremental resources](incremental-loading.md#incremental-loading-with-a-cursor-field) this will +reset the cursor state and fully reload the data from the `initial_value`. + +The schema remains unmodified in this case. +```py +import dlt + +pipeline = dlt.pipeline("airtable_demo", destination="duckdb") +pipeline.run(airtable_emojis().with_resources("📆 Schedule"), refresh="drop_data") +``` +Above the incremental state of the "📆 Schedule" is reset before `extract` step so data is fully reacquired. Just before `load` step starts, + the "_schedule" is truncated and new (full) table data will be inserted/copied. ## Display the loading progress