From 3e33b4967eac2cb4e6725eac52336fe0e319de59 Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Fri, 12 Jul 2024 11:05:42 +0200 Subject: [PATCH] add support for minio, r2, gcs --- .github/workflows/LocalTesting.yml | 87 ++++++++++++++ extension-ci-tools | 2 +- extension_config.cmake | 7 ++ scripts/create_minio_credential_file.sh | 43 +++++++ scripts/upload_test_files_to_minio.sh | 4 + src/functions/delta_scan.cpp | 75 +++++++++++- test/sql/cloud/minio_local/gcs_r2.test | 93 +++++++++++++++ test/sql/cloud/minio_local/minio_local.test | 121 ++++++++++++++++++++ vcpkg.json | 7 +- 9 files changed, 432 insertions(+), 7 deletions(-) create mode 100755 scripts/create_minio_credential_file.sh create mode 100755 scripts/upload_test_files_to_minio.sh create mode 100644 test/sql/cloud/minio_local/gcs_r2.test create mode 100644 test/sql/cloud/minio_local/minio_local.test diff --git a/.github/workflows/LocalTesting.yml b/.github/workflows/LocalTesting.yml index ecdc23c..a424ebf 100644 --- a/.github/workflows/LocalTesting.yml +++ b/.github/workflows/LocalTesting.yml @@ -18,6 +18,7 @@ jobs: VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake AZURE_STORAGE_CONNECTION_STRING: 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;' AZURE_STORAGE_ACCOUNT: devstoreaccount1 + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true steps: - uses: actions/checkout@v3 @@ -76,6 +77,92 @@ jobs: echo "## azurite" cat azurite_log.txt + minio-tests-linux: + name: Minio (local S3 test server) tests (Linux) + runs-on: ubuntu-latest + env: + S3_TEST_SERVER_AVAILABLE: 1 + GEN: ninja + VCPKG_TARGET_TRIPLET: x64-linux + VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake + + steps: + 
- uses: actions/checkout@v3 + with: + fetch-depth: 0 + submodules: 'true' + + - name: Checkout DuckDB to version + if: ${{ matrix.duckdb_version != ''}} + run: | + cd duckdb + git checkout ${{ matrix.duckdb_version }} + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install Ninja + shell: bash + run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build + + - name: Setup Ccache + uses: hendrikmuhs/ccache-action@main + with: + key: ${{ github.job }} + save: ${{ github.ref == 'refs/heads/main' || github.repository != 'duckdb/duckdb' }} + + - name: Setup vcpkg + uses: lukka/run-vcpkg@v11.1 + with: + vcpkgGitCommitId: a1a1cbc975abf909a6c8985a6a2b8fe20bbd9bd6 + + - name: Build + shell: bash + run: make + + - name: Start S3/HTTP test server + shell: bash + run: | + cd duckdb + mkdir data/attach_test + touch data/attach_test/attach.db + sudo ./scripts/install_s3_test_server.sh + source ./scripts/run_s3_test_server.sh + sleep 30 + + - name: Write AWS credentials file + shell: bash + run: | + ./scripts/create_minio_credential_file.sh + + - name: Copy files to minio + shell: bash + env: + DUCKDB_MINIO_TEST_SERVER_AVAILABLE: 1 + AWS_ACCESS_KEY_ID: minio_duckdb_user + AWS_SECRET_ACCESS_KEY: minio_duckdb_user_password + AWS_DEFAULT_REGION: eu-west-1 + AWS_ENDPOINT: duckdb-minio.com:9000 + run: | + ./scripts/upload_test_files_to_minio.sh + + - name: Test + shell: bash + run: | + make test + + - name: Run Env tests + shell: bash + env: + DUCKDB_MINIO_TEST_SERVER_AVAILABLE: 1 + AWS_ACCESS_KEY_ID: minio_duckdb_user + AWS_SECRET_ACCESS_KEY: minio_duckdb_user_password + AWS_DEFAULT_REGION: eu-west-1 + AWS_ENDPOINT: duckdb-minio.com:9000 + run: | + ./build/release/test/unittest "*/test/sql/cloud/minio_local/*" + generated-tests-linux: name: Generated Tests (Linux) runs-on: ubuntu-latest diff --git a/extension-ci-tools b/extension-ci-tools index 71b8a60..d6d09ae 160000 --- a/extension-ci-tools +++ b/extension-ci-tools @@ -1 +1 @@ 
-Subproject commit 71b8a603ea24b1ac8a2cff134aca28163576548f +Subproject commit d6d09ae94e71ae74d21f71bed5f9057accbb7505 diff --git a/extension_config.cmake b/extension_config.cmake index 16571c2..b2ba8c0 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -16,6 +16,13 @@ duckdb_extension_load(azure GIT_TAG 49b63dc8cd166952a0a34dfd54e6cfe5b823e05e ) +# Build the aws extension to test with credential providers +duckdb_extension_load(aws + LOAD_TESTS + GIT_URL https://github.com/duckdb/duckdb_aws + GIT_TAG 3d1f5c8d0127ff7aaf127935721b197e5fdd95e5 +) + # Build the tpch and tpcds extension for testing/benchmarking duckdb_extension_load(tpch) duckdb_extension_load(tpcds) diff --git a/scripts/create_minio_credential_file.sh b/scripts/create_minio_credential_file.sh new file mode 100755 index 0000000..c9f88d2 --- /dev/null +++ b/scripts/create_minio_credential_file.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Warning: overwrites your existing aws credentials file! + +# Set the file path for the credentials file +credentials_file=~/.aws/credentials + +# Set the file path for the config file +config_file=~/.aws/config + +# create dir if not already exists +mkdir -p ~/.aws + +# Create the credentials configuration +credentials_str="[default] +aws_access_key_id=minio_duckdb_user +aws_secret_access_key=minio_duckdb_user_password + +[minio-testing-2] +aws_access_key_id=minio_duckdb_user_2 +aws_secret_access_key=minio_duckdb_user_2_password + +[minio-testing-invalid] +aws_access_key_id=minio_duckdb_user_invalid +aws_secret_access_key=thispasswordiscompletelywrong +aws_session_token=completelybogussessiontoken +" + +# Write the credentials configuration to the file +echo "$credentials_str" > "$credentials_file" + +# Create the credentials configuration +config_str="[default] +region=eu-west-1 + +[profile minio-testing-2] +region=eu-west-1 + +[profile minio-testing-invalid] +region=the-moon-123 +" + +# Write the config to the file +echo "$config_str" > "$config_file" \ No 
newline at end of file diff --git a/scripts/upload_test_files_to_minio.sh b/scripts/upload_test_files_to_minio.sh new file mode 100755 index 0000000..c5723c7 --- /dev/null +++ b/scripts/upload_test_files_to_minio.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +aws s3 cp --endpoint-url http://duckdb-minio.com:9000 --recursive ./build/release/rust/src/delta_kernel/acceptance/tests/dat/out/reader_tests/generated "s3://test-bucket/dat" +aws s3 cp --endpoint-url http://duckdb-minio.com:9000 --recursive ./build/release/rust/src/delta_kernel/acceptance/tests/dat/out/reader_tests/generated "s3://test-bucket-public/dat" \ No newline at end of file diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp index 5dae760..5d50c8b 100644 --- a/src/functions/delta_scan.cpp +++ b/src/functions/delta_scan.cpp @@ -112,7 +112,14 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p ffi::EngineBuilder* builder; // For "regular" paths we early out with the default builder config - if (!StringUtil::StartsWith(path, "s3://") && !StringUtil::StartsWith(path, "azure://") && !StringUtil::StartsWith(path, "az://") && !StringUtil::StartsWith(path, "abfs://") && !StringUtil::StartsWith(path, "abfss://")) { + if (!StringUtil::StartsWith(path, "s3://") && + !StringUtil::StartsWith(path, "gcs://") && + !StringUtil::StartsWith(path, "gs://") && + !StringUtil::StartsWith(path, "r2://") && + !StringUtil::StartsWith(path, "azure://") && + !StringUtil::StartsWith(path, "az://") && + !StringUtil::StartsWith(path, "abfs://") && + !StringUtil::StartsWith(path, "abfss://")) { auto interface_builder_res = ffi::get_engine_builder(KernelUtils::ToDeltaString(path), DuckDBEngineError::AllocateError); return KernelUtils::UnpackResult(interface_builder_res, "get_engine_interface_builder for path " + path); } @@ -130,6 +137,33 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p bucket = path.substr(5, end_of_container-5); path_in_bucket = 
path.substr(end_of_container); secret_type = "s3"; + } else if (StringUtil::StartsWith(path, "gcs://")) { + auto end_of_container = path.find('/',6); + + if(end_of_container == string::npos) { + throw IOException("Invalid gcs url passed to delta scan: %s", path); + } + bucket = path.substr(6, end_of_container-6); + path_in_bucket = path.substr(end_of_container); + secret_type = "gcs"; + } else if (StringUtil::StartsWith(path, "gs://")) { + auto end_of_container = path.find('/',5); + + if(end_of_container == string::npos) { + throw IOException("Invalid gcs url passed to delta scan: %s", path); + } + bucket = path.substr(5, end_of_container-5); + path_in_bucket = path.substr(end_of_container); + secret_type = "gcs"; + } else if (StringUtil::StartsWith(path, "r2://")) { + auto end_of_container = path.find('/',5); + + if(end_of_container == string::npos) { + throw IOException("Invalid r2 url passed to delta scan: %s", path); + } + bucket = path.substr(5, end_of_container-5); + path_in_bucket = path.substr(end_of_container); + secret_type = "r2"; } else if ((StringUtil::StartsWith(path, "azure://")) || (StringUtil::StartsWith(path, "abfss://"))) { auto end_of_container = path.find('/',8); @@ -159,8 +193,18 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p secret_type = "azure"; } - auto interface_builder_res = ffi::get_engine_builder(KernelUtils::ToDeltaString(path), DuckDBEngineError::AllocateError); - builder = KernelUtils::UnpackResult(interface_builder_res, "get_engine_interface_builder for path " + path); + // We need to substitute DuckDB's usage of s3 and r2 paths because delta kernel needs to just interpret them as s3 protocol servers. 
+ string cleaned_path; + if (StringUtil::StartsWith(path, "r2://") || StringUtil::StartsWith(path, "gs://") ) { + cleaned_path = "s3://" + path.substr(5); + } else if (StringUtil::StartsWith(path, "gcs://")) { + cleaned_path = "s3://" + path.substr(6); + } else { + cleaned_path = path; + } + + auto interface_builder_res = ffi::get_engine_builder(KernelUtils::ToDeltaString(cleaned_path), DuckDBEngineError::AllocateError); + builder = KernelUtils::UnpackResult(interface_builder_res, "get_engine_interface_builder for path " + cleaned_path); // For S3 or Azure paths we need to trim the url, set the container, and fetch a potential secret auto &secret_manager = SecretManager::Get(context); @@ -170,18 +214,24 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p // No secret: nothing left to do here! if (!secret_match.HasMatch()) { + if (StringUtil::StartsWith(path, "r2://") || StringUtil::StartsWith(path, "gs://") || StringUtil::StartsWith(path, "gcs://")) { + throw NotImplementedException("Can not scan a gcs:// gs:// or r2:// url without a secret providing its endpoint currently. 
Please create an R2 or GCS secret containing the credentials for this endpoint and try again."); + } + return builder; } const auto &kv_secret = dynamic_cast(*secret_match.secret_entry->secret); - // Here you would need to add the logic for setting the builder options for Azure // This is just a placeholder and will need to be replaced with the actual logic - if (secret_type == "s3") { + if (secret_type == "s3" || secret_type == "gcs" || secret_type == "r2") { auto key_id = kv_secret.TryGetValue("key_id").ToString(); auto secret = kv_secret.TryGetValue("secret").ToString(); auto session_token = kv_secret.TryGetValue("session_token").ToString(); auto region = kv_secret.TryGetValue("region").ToString(); + auto endpoint = kv_secret.TryGetValue("endpoint").ToString(); + auto use_ssl = kv_secret.TryGetValue("use_ssl").ToString(); + auto url_style = kv_secret.TryGetValue("url_style").ToString(); if (key_id.empty() && secret.empty()) { ffi::set_builder_option(builder, KernelUtils::ToDeltaString("skip_signature"), KernelUtils::ToDeltaString("true")); @@ -196,6 +246,21 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p if (!session_token.empty()) { ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_session_token"), KernelUtils::ToDeltaString(session_token)); } + if (!endpoint.empty() && endpoint != "s3.amazonaws.com") { + if(!StringUtil::StartsWith(endpoint, "https://") && !StringUtil::StartsWith(endpoint, "http://")) { + if(use_ssl == "1" || use_ssl == "NULL") { + endpoint = "https://" + endpoint; + } else { + endpoint = "http://" + endpoint; + } + } + + if (StringUtil::StartsWith(endpoint, "http://")) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("allow_http"), KernelUtils::ToDeltaString("true")); + } + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_endpoint"), KernelUtils::ToDeltaString(endpoint)); + } + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_region"), 
KernelUtils::ToDeltaString(region)); } else if (secret_type == "azure") { diff --git a/test/sql/cloud/minio_local/gcs_r2.test b/test/sql/cloud/minio_local/gcs_r2.test new file mode 100644 index 0000000..319380c --- /dev/null +++ b/test/sql/cloud/minio_local/gcs_r2.test @@ -0,0 +1,93 @@ +# name: test/sql/cloud/minio_local/gcs_r2.test +# description: test delta extension with GCS and R2 +# group: [aws] + +require httpfs + +require parquet + +require delta + +require aws + +require-env DUCKDB_MINIO_TEST_SERVER_AVAILABLE + +require-env AWS_ACCESS_KEY_ID + +require-env AWS_SECRET_ACCESS_KEY + +require-env AWS_DEFAULT_REGION + +require-env AWS_ENDPOINT + +statement ok +set secret_directory='__TEST_DIR__/minio_local_gcs_env' + +statement error +FROM delta_scan('gcs://test-bucket/dat/all_primitive_types/delta') +---- +Can not scan a gcs:// gs:// or r2:// url without a secret providing its endpoint currently. Please create an R2 or GCS secret containing the credentials for this endpoint and try again. + +statement error +FROM delta_scan('gs://test-bucket/dat/all_primitive_types/delta') +---- +Can not scan a gcs:// gs:// or r2:// url without a secret providing its endpoint currently. Please create an R2 or GCS secret containing the credentials for this endpoint and try again. + +statement error +FROM delta_scan('r2://test-bucket/dat/all_primitive_types/delta') +---- +Can not scan a gcs:// gs:// or r2:// url without a secret providing its endpoint currently. Please create an R2 or GCS secret containing the credentials for this endpoint and try again. 
+ # create a fake gcs secret +statement ok +CREATE SECRET ( + TYPE GCS, + KEY_ID '${AWS_ACCESS_KEY_ID}', + SECRET '${AWS_SECRET_ACCESS_KEY}', + REGION '${AWS_DEFAULT_REGION}', + ENDPOINT '${AWS_ENDPOINT}', + USE_SSL false +) + +query I +SELECT int32 +FROM delta_scan('gcs://test-bucket-public/dat/all_primitive_types/delta') +---- +0 +1 +2 +3 +4 + +query I +SELECT int32 +FROM delta_scan('gs://test-bucket-public/dat/all_primitive_types/delta') +---- +0 +1 +2 +3 +4 + +# create a fake r2 secret +statement ok +CREATE SECRET s1 ( + TYPE R2, + PROVIDER config, + account_id 'some_bogus_account', + KEY_ID '${AWS_ACCESS_KEY_ID}', + SECRET '${AWS_SECRET_ACCESS_KEY}', + REGION '${AWS_DEFAULT_REGION}', + ENDPOINT '${AWS_ENDPOINT}', + USE_SSL false +) + +query I +SELECT int32 +FROM delta_scan('r2://test-bucket-public/dat/all_primitive_types/delta') +---- +0 +1 +2 +3 +4 diff --git a/test/sql/cloud/minio_local/minio_local.test b/test/sql/cloud/minio_local/minio_local.test new file mode 100644 index 0000000..28031f7 --- /dev/null +++ b/test/sql/cloud/minio_local/minio_local.test @@ -0,0 +1,121 @@ +# name: test/sql/cloud/minio_local/minio_local.test +# description: test delta extension with a local minio installation +# group: [aws] + +require httpfs + +require parquet + +require delta + +require aws + +require-env DUCKDB_MINIO_TEST_SERVER_AVAILABLE + +require-env AWS_ACCESS_KEY_ID + +require-env AWS_SECRET_ACCESS_KEY + +require-env AWS_DEFAULT_REGION + +require-env AWS_ENDPOINT + +statement ok +set secret_directory='__TEST_DIR__/aws_secret_chains_env' + +# Secret with just the endpoint +statement ok +CREATE SECRET s1 ( + TYPE S3, + ENDPOINT '${AWS_ENDPOINT}', + USE_SSL false +); + +# We need auth for this +statement error +SELECT int32 +FROM delta_scan('s3://test-bucket/dat/all_primitive_types/delta') +---- +IO Error + +# unauthenticated query is fine! 
+query I +SELECT int32 +FROM delta_scan('s3://test-bucket-public/dat/all_primitive_types/delta') +---- +0 +1 +2 +3 +4 + +statement ok +DROP SECRET S1; + +# Now we create a config secret with credentials +statement ok +CREATE SECRET s1 ( + TYPE S3, + PROVIDER config, + KEY_ID '${AWS_ACCESS_KEY_ID}', + SECRET '${AWS_SECRET_ACCESS_KEY}', + REGION '${AWS_DEFAULT_REGION}', + ENDPOINT '${AWS_ENDPOINT}', + USE_SSL false +); + +# Public bucket now does work +query I +SELECT int32 +FROM delta_scan('s3://test-bucket-public/dat/all_primitive_types/delta') +---- +0 +1 +2 +3 +4 + +# Private bucket now does work too +query I +SELECT int32 +FROM delta_scan('s3://test-bucket/dat/all_primitive_types/delta') +---- +0 +1 +2 +3 +4 + +statement ok +DROP SECRET S1; + +# Now we create a credential chain secret that searches the env vars automatically +statement ok +CREATE SECRET s1 ( + TYPE S3, + PROVIDER credential_chain, + ENDPOINT '${AWS_ENDPOINT}', + USE_SSL false +); + +# Still works! +query I +SELECT int32 +FROM delta_scan('s3://test-bucket-public/dat/all_primitive_types/delta') +---- +0 +1 +2 +3 +4 + +# Still works! +query I +SELECT int32 +FROM delta_scan('s3://test-bucket/dat/all_primitive_types/delta') +---- +0 +1 +2 +3 +4 diff --git a/vcpkg.json b/vcpkg.json index 0cefd94..8e8245d 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -3,6 +3,11 @@ "azure-identity-cpp", "azure-storage-blobs-cpp", "azure-storage-files-datalake-cpp", - "openssl" + "openssl", + "zlib", + { + "name": "aws-sdk-cpp", + "features": [ "sts" ] + } ] } \ No newline at end of file