From bd2b9ae4ba5c02bb7f2a8cb122d128c5d64ef5e0 Mon Sep 17 00:00:00 2001
From: Sam Ansmink
Date: Thu, 27 Jun 2024 17:35:01 +0200
Subject: [PATCH] add cardinality estimation

Implement DeltaSnapshot::GetCardinality by summing the per-file record
counts that visit_callback stores in DeltaFileMetaData::cardinality.

The stats pointer handed to visit_callback is only dereferenced when it
is non-null; files without stats keep the INVALID_INDEX default and are
skipped when summing. When no file carries stats, GetCardinality returns
nullptr (no estimate) instead of a wrapped-around sum.

---
 src/functions/delta_scan.cpp                  | 32 +++++++++++++++++++++++++++++++-
 src/include/functions/delta_scan.hpp          |  3 +++
 test/sql/dat/basic_append.test                | 19 +++++++++++++++++++
 .../delta_kernel_rs/basic_partitioned.test    |  5 +++++
 4 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp
index feae74c..ef84925 100644
--- a/src/functions/delta_scan.cpp
+++ b/src/functions/delta_scan.cpp
@@ -44,7 +44,7 @@ string url_decode(string input) {
     return result;
 }
 
-static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::KernelStringSlice path, int64_t size, const ffi::Stats *, const ffi::DvInfo *dv_info, const struct ffi::CStringMap *partition_values) {
+static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::KernelStringSlice path, int64_t size, const ffi::Stats * stats, const ffi::DvInfo *dv_info, const struct ffi::CStringMap *partition_values) {
     auto context = (DeltaSnapshot *) engine_context;
     auto path_string = context->GetPath();
     StringUtil::RTrim(path_string, "/");
@@ -61,6 +61,10 @@ static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::Kernel
     // Initialize the file metadata
     context->metadata.back()->delta_snapshot_version = context->version;
     context->metadata.back()->file_number = context->resolved_files.size() - 1;
+    // Stats may be absent (null pointer): cardinality then keeps its INVALID_INDEX default
+    if (stats) {
+        context->metadata.back()->cardinality = stats->num_records;
+    }
 
     // Fetch the deletion vector
     auto selection_vector_res = ffi::selection_vector_from_dv(dv_info, context->extern_engine.get(), context->global_state.get());
@@ -292,6 +296,32 @@ idx_t DeltaSnapshot::GetTotalFileCount() {
     return resolved_files.size();
 }
 
+unique_ptr<NodeStatistics> DeltaSnapshot::GetCardinality(ClientContext &context) {
+    // This also ensures all files are expanded
+    auto total_file_count = DeltaSnapshot::GetTotalFileCount();
+
+    if (total_file_count == 0) {
+        return make_uniq<NodeStatistics>(0,0);
+    }
+
+    // Only sum files that carry a record count; INVALID_INDEX marks files without stats
+    idx_t total_tuple_count = 0;
+    bool have_any_stats = false;
+    for (auto &metadatum : metadata) {
+        if (metadatum->cardinality != DConstants::INVALID_INDEX) {
+            have_any_stats = true;
+            total_tuple_count += metadatum->cardinality;
+        }
+    }
+
+    if (have_any_stats) {
+        return make_uniq<NodeStatistics>(total_tuple_count,total_tuple_count);
+    }
+
+    // No file had stats: report no estimate rather than a bogus one
+    return nullptr;
+}
+
 unique_ptr<MultiFileReader> DeltaMultiFileReader::CreateInstance() {
     return std::move(make_uniq<DeltaMultiFileReader>());
 }
diff --git a/src/include/functions/delta_scan.hpp b/src/include/functions/delta_scan.hpp
index 07c782b..4f65578 100644
--- a/src/include/functions/delta_scan.hpp
+++ b/src/include/functions/delta_scan.hpp
@@ -28,6 +28,7 @@ struct DeltaFileMetaData {
 
     idx_t delta_snapshot_version = DConstants::INVALID_INDEX;
     idx_t file_number = DConstants::INVALID_INDEX;
+    idx_t cardinality = DConstants::INVALID_INDEX;
     ffi::KernelBoolSlice selection_vector = {nullptr, 0};
     case_insensitive_map_t<string> partition_map;
 };
@@ -49,6 +50,8 @@ struct DeltaSnapshot : public MultiFileList {
     FileExpandResult GetExpandResult() override;
     idx_t GetTotalFileCount() override;
 
+    unique_ptr<NodeStatistics> GetCardinality(ClientContext &context) override;
+
 protected:
     //! Get the i-th expanded file
     string GetFile(idx_t i) override;
diff --git a/test/sql/dat/basic_append.test b/test/sql/dat/basic_append.test
index 87930b8..2286eed 100644
--- a/test/sql/dat/basic_append.test
+++ b/test/sql/dat/basic_append.test
@@ -20,6 +20,12 @@ FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
 ----
 5
 
+# Cardinality estimation should correctly show this
+query II
+EXPLAIN FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
+----
+physical_plan	<REGEX>:.*EC: 5 .*
+
 query I
 SELECT count(number)
 FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
@@ -78,3 +84,16 @@ SELECT a_float, number, letter
 FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
 WHERE number > 6
 ----
+
+# Filters are reflected in cardinality estimation: filtering out all files shows 0 EC
+query II
+EXPLAIN FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
+WHERE number > 6
+----
+physical_plan	<REGEX>:.*EC: 0 .*
+
+query II
+EXPLAIN FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
+WHERE number > 4
+----
+physical_plan	<REGEX>:.*EC: 1 .*
diff --git a/test/sql/delta_kernel_rs/basic_partitioned.test b/test/sql/delta_kernel_rs/basic_partitioned.test
index d66d012..4446477 100644
--- a/test/sql/delta_kernel_rs/basic_partitioned.test
+++ b/test/sql/delta_kernel_rs/basic_partitioned.test
@@ -17,3 +17,8 @@ e	5	5.5
 a	1	1.1
 b	2	2.2
 c	3	3.3
+
+query II
+EXPLAIN FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/basic_partitioned')
+----
+physical_plan	<REGEX>:.*EC: 6 .*